Search:

CWIS Developers Documentation

  • Main Page
  • Classes
  • Files
  • File List
  • File Members

SearchEngine.php

Go to the documentation of this file.
00001 <?PHP
00002 
00003 #
00004 #   FILE:  SearchEngine.php
00005 #
00006 #   Open Source Metadata Archive Search Engine (OSMASE)
00007 #   Copyright 2002-2011 Edward Almasy and Internet Scout
00008 #   http://scout.wisc.edu
00009 #
00010 
00011 class SearchEngine {
00012 
00013     # ---- PUBLIC INTERFACE --------------------------------------------------
00014 
00015     # possible types of logical operators
00016     const LOGIC_AND = 1;
00017     const LOGIC_OR = 2;
00018 
00019     # flags used for indicating field types
00020     const FIELDTYPE_TEXT = 1;
00021     const FIELDTYPE_NUMERIC = 2;
00022     const FIELDTYPE_DATE = 3;
00023     const FIELDTYPE_DATERANGE = 4;
00024 
00025     # object constructor
00026     function SearchEngine(&$DB, $ItemTableName, $ItemIdFieldName)
00027     {
00028         # save database object for our use
00029         $this->DB = $DB;
00030 
00031         # save item access parameters
00032         $this->ItemTableName = $ItemTableName;
00033         $this->ItemIdFieldName = $ItemIdFieldName;
00034 
00035         # define flags used for indicating word states
00036         if (!defined("WORD_PRESENT"))  {  define("WORD_PRESENT", 1);  }
00037         if (!defined("WORD_EXCLUDED")) {  define("WORD_EXCLUDED", 2);  }
00038         if (!defined("WORD_REQUIRED")) {  define("WORD_REQUIRED", 4);  }
00039 
00040         # set default debug state
00041         $this->DebugLevel = 0;
00042     }
00043 
00044     # add field to be searched
00045     function AddField(
00046             $FieldName, $DBFieldName, $FieldType, $Weight, $UsedInKeywordSearch)
00047     {
00048         # save values
00049         $this->FieldInfo[$FieldName]["DBFieldName"] = $DBFieldName;
00050         $this->FieldInfo[$FieldName]["FieldType"] = $FieldType;
00051         $this->FieldInfo[$FieldName]["Weight"] = $Weight;
00052         $this->FieldInfo[$FieldName]["InKeywordSearch"] = $UsedInKeywordSearch;
00053     }
00054 
00055     # retrieve info about tables and fields (useful for child objects)
00056     function ItemTableName() {  return $this->ItemTableName;  }
00057     function ItemIdFieldName() {  return $this->ItemIdFieldName;  }
00058     function DBFieldName($FieldName)
00059             {  return $this->FieldInfo[$FieldName]["DBFieldName"];  }
00060     function FieldType($FieldName)
00061             {  return $this->FieldInfo[$FieldName]["FieldType"];  }
00062     function FieldWeight($FieldName)
00063             {  return $this->FieldInfo[$FieldName]["Weight"];  }
00064     function FieldInKeywordSearch($FieldName)
00065             {  return $this->FieldInfo[$FieldName]["InKeywordSearch"];  }
00066 
00067     # set debug level
00068     function DebugLevel($Setting)
00069     {
00070         $this->DebugLevel = $Setting;
00071     }
00072 
00073 
00074     # ---- search functions
00075 
00076     # perform keyword search
00077     function Search($SearchString, $StartingResult = 0, $NumberOfResults = 10,
00078             $SortByField = NULL, $SortDescending = TRUE)
00079     {
00080         $SearchString = $this->SetDebugLevel($SearchString);
00081         $this->DMsg(0, "In Search() with search string \"".$SearchString."\"");
00082 
00083         # save start time to use in calculating search time
00084         $StartTime = microtime(TRUE);
00085 
00086         # clear word counts
00087         $this->InclusiveTermCount = 0;
00088         $this->RequiredTermCount = 0;
00089         $this->ExcludedTermCount = 0;
00090 
00091         # parse search string into terms
00092         $Words = $this->ParseSearchStringForWords($SearchString);
00093         $this->DMsg(1, "Found ".count($Words)." words");
00094 
00095         # parse search string for phrases
00096         $Phrases = $this->ParseSearchStringForPhrases($SearchString);
00097         $this->DMsg(1, "Found ".count($Phrases)." phrases");
00098 
00099         # if only excluded terms specified
00100         if ($this->ExcludedTermCount && !$this->InclusiveTermCount)
00101         {
00102             # load all records
00103             $this->DMsg(1, "Loading all records");
00104             $Scores = $this->LoadScoresForAllRecords();
00105         }
00106         else
00107         {
00108             # perform searches
00109             $Scores = $this->SearchForWords($Words);
00110             $this->DMsg(1, "Found ".count($Scores)." results after word search");
00111             $Scores = $this->SearchForPhrases($Phrases, $Scores);
00112             $this->DMsg(1, "Found ".count($Scores)." results after phrase search");
00113         }
00114 
00115         # if search results found
00116         if (count($Scores) > 0)
00117         {
00118             # handle any excluded words
00119             $Scores = $this->FilterOnExcludedWords($Words, $Scores);
00120 
00121             # strip off any results that don't contain required words
00122             $Scores = $this->FilterOnRequiredWords($Scores);
00123         }
00124 
00125         # count, sort, and trim search result scores list
00126         $Scores = $this->CleanScores($Scores, $StartingResult, $NumberOfResults,
00127                 $SortByField, $SortDescending);
00128 
00129         # record search time
00130         $this->LastSearchTime = microtime(TRUE) - $StartTime;
00131 
00132         # return list of items to caller
00133         $this->DMsg(0, "Ended up with ".$this->NumberOfResultsAvailable." results");
00134         return $Scores;
00135     }
00136 
00137     # perform search across multiple fields and return trimmed results to caller
00138     function FieldedSearch($SearchStrings, $StartingResult = 0, $NumberOfResults = 10,
00139             $SortByField = NULL, $SortDescending = TRUE)
00140     {
00141         $SearchStrings = $this->SetDebugLevel($SearchStrings);
00142         $this->DMsg(0, "In FieldedSearch() with "
00143                 .count($SearchStrings)." search strings");
00144 
00145         # save start time to use in calculating search time
00146         $StartTime = microtime(TRUE);
00147 
00148         # perform search
00149         $Scores = $this->SearchAcrossFields($SearchStrings);
00150         $Scores = ($Scores === NULL) ? array() : $Scores;
00151 
00152         # count, sort, and trim search result scores list
00153         $Scores = $this->CleanScores($Scores, $StartingResult, $NumberOfResults,
00154                 $SortByField, $SortDescending);
00155 
00156         # record search time
00157         $this->LastSearchTime = microtime(TRUE) - $StartTime;
00158 
00159         # return list of items to caller
00160         $this->DMsg(0, "Ended up with ".$this->NumberOfResultsAvailable." results");
00161         return $Scores;
00162     }
00163 
00164     # perform search with logical groups of fielded searches
00165     function GroupedSearch($SearchGroups, $StartingResult = 0, $NumberOfResults = 10,
00166             $SortByField = NULL, $SortDescending = TRUE)
00167     {
00168         foreach ($SearchGroups as $Index => $Groups)
00169         {
00170             if (isset($SearchGroups[$Index]["SearchStrings"]))
00171             {
00172                 $SearchGroups[$Index]["SearchStrings"] =
00173                         $this->SetDebugLevel($SearchGroups[$Index]["SearchStrings"]);
00174             }
00175         }
00176         $this->DMsg(0, "In GroupedSearch() with "
00177                 .count($SearchGroups)." search groups");
00178 
00179         # save start time to use in calculating search time
00180         $StartTime = microtime(TRUE);
00181 
00182         # start with no results
00183         $Scores = array();
00184 
00185         # save AND/OR search setting
00186         $SavedSearchLogic = $this->DefaultSearchLogic;
00187 
00188         # for each search group
00189         $FirstSearch = TRUE;
00190         foreach ($SearchGroups as $Group)
00191         {
00192             $this->DMsg(0, "----- GROUP ---------------------------");
00193 
00194             # if group has AND/OR setting specified
00195             if (isset($Group["Logic"]))
00196             {
00197                 # use specified AND/OR setting
00198                 $this->DefaultSearchLogic = $Group["Logic"];
00199             }
00200             else
00201             {
00202                 # use saved AND/OR setting
00203                 $this->DefaultSearchLogic = $SavedSearchLogic;
00204             }
00205             $this->DMsg(2, "Logic is "
00206                     .(($this->DefaultSearchLogic == self::LOGIC_AND) ? "AND" : "OR"));
00207 
00208             # if we have search strings for this group
00209             if (isset($Group["SearchStrings"]))
00210             {
00211                 # perform search
00212                 $GroupScores = $this->SearchAcrossFields($Group["SearchStrings"]);
00213 
00214                 # if search was conducted
00215                 if ($GroupScores !== NULL)
00216                 {
00217                     # if saved AND/OR setting is OR or this is first search
00218                     if (($SavedSearchLogic == self::LOGIC_OR) || $FirstSearch)
00219                     {
00220                         # add search results to result list
00221                         foreach ($GroupScores as $ItemId => $Score)
00222                         {
00223                             if (isset($Scores[$ItemId]))
00224                             {
00225                                 $Scores[$ItemId] += $Score;
00226                             }
00227                             else
00228                             {
00229                                 $Scores[$ItemId] = $Score;
00230                             }
00231                         }
00232 
00233                         # (reset flag indicating first search)
00234                         $FirstSearch = FALSE;
00235                     }
00236                     else
00237                     {
00238                         # AND search results with previous results
00239                         $OldScores = $Scores;
00240                         $Scores = array();
00241                         foreach ($GroupScores as $ItemId => $Score)
00242                         {
00243                             if (isset($OldScores[$ItemId]))
00244                             {
00245                                 $Scores[$ItemId] = $OldScores[$ItemId] + $Score;
00246                             }
00247                         }
00248                     }
00249                 }
00250             }
00251         }
00252 
00253         # restore AND/OR search setting
00254         $this->DefaultSearchLogic = $SavedSearchLogic;
00255 
00256         # count, sort, and trim search result scores list
00257         $Scores = $this->CleanScores($Scores, $StartingResult, $NumberOfResults,
00258                 $SortByField, $SortDescending);
00259 
00260         # record search time
00261         $this->LastSearchTime = microtime(TRUE) - $StartTime;
00262 
00263         # return search results to caller
00264         $this->DMsg(0, "Ended up with ".$this->NumberOfResultsAvailable." results");
00265         return $Scores;
00266     }
00267 
00268     # add function that will be called to filter search results
00269     function AddResultFilterFunction($FunctionName)
00270     {
00271         # save filter function name
00272         $this->FilterFuncs[] = $FunctionName;
00273     }
00274 
00275     # get or set default search logic (AND or OR)
00276     function DefaultSearchLogic($NewSetting = NULL)
00277     {
00278         if ($NewSetting != NULL)
00279         {
00280             $this->DefaultSearchLogic = $NewSetting;
00281         }
00282         return $this->DefaultSearchLogic;
00283     }
00284 
00285     function SearchTermsRequiredByDefault($NewSetting = TRUE)
00286     {
00287         if ($NewSetting)
00288         {
00289             $this->DefaultSearchLogic = self::LOGIC_AND;
00290         }
00291         else
00292         {
00293             $this->DefaultSearchLogic = self::LOGIC_OR;
00294         }
00295     }
00296 
00297     function NumberOfResults()
00298     {
00299         return $this->NumberOfResultsAvailable;
00300     }
00301 
00302     function SearchTerms()
00303     {
00304         return $this->SearchTermList;
00305     }
00306 
00307     function SearchTime()
00308     {
00309         return $this->LastSearchTime;
00310     }
00311 
00312     # report total weight for all fields involved in search
00313     function FieldedSearchWeightScale($SearchStrings)
00314     {
00315         $Weight = 0;
00316         $IncludedKeywordSearch = FALSE;
00317         foreach ($SearchStrings as $FieldName => $SearchStringArray)
00318         {
00319             if ($FieldName == "XXXKeywordXXX")
00320             {
00321                 $IncludedKeywordSearch = TRUE;
00322             }
00323             else
00324             {
00325                 $Weight += $this->FieldInfo[$FieldName]["Weight"];
00326             }
00327         }
00328         if ($IncludedKeywordSearch)
00329         {
00330             foreach ($this->FieldInfo as $FieldName => $Info)
00331             {
00332                 if ($Info["InKeywordSearch"])
00333                 {
00334                     $Weight += $Info["Weight"];
00335                 }
00336             }
00337         }
00338         return $Weight;
00339     }
00340 
00341 
00342     # ---- search database update functions
00343 
00344     # update search DB for the specified item
00345     function UpdateForItem($ItemId)
00346     {
00347         # bail out if item ID is negative (indicating a temporary record)
00348         if ($ItemId < 0) {  return;  }
00349 
00350         # clear word count added flags for this item
00351         unset($this->WordCountAdded);
00352 
00353         # delete any existing info for this item
00354         $this->DB->Query("DELETE FROM SearchWordCounts WHERE ItemId = ".$ItemId);
00355 
00356         # for each metadata field
00357         foreach ($this->FieldInfo as $FieldName => $Info)
00358         {
00359             # if search weight for field is positive
00360             if ($Info["Weight"] > 0)
00361             {
00362                 # retrieve text for field
00363                 $Text = $this->GetFieldContent($ItemId, $FieldName);
00364 
00365                 # if text is array
00366                 if (is_array($Text))
00367                 {
00368                     # for each text string in array
00369                     foreach ($Text as $String)
00370                     {
00371                         # record search info for text
00372                         $this->RecordSearchInfoForText($ItemId, $FieldName,
00373                                                        $Info["Weight"], $String,
00374                                                        $Info["InKeywordSearch"]);
00375                     }
00376                 }
00377                 else
00378                 {
00379                     # record search info for text
00380                     $this->RecordSearchInfoForText($ItemId, $FieldName,
00381                                                    $Info["Weight"], $Text,
00382                                                    $Info["InKeywordSearch"]);
00383                 }
00384             }
00385         }
00386     }
00387 
00388     # update search DB for the specified range of items
00389     function UpdateForItems($StartingItemId, $NumberOfItems)
00390     {
00391         # retrieve IDs for specified number of items starting at specified ID
00392         $this->DB->Query("SELECT ".$this->ItemIdFieldName." FROM ".$this->ItemTableName
00393                 ." WHERE ".$this->ItemIdFieldName." >= ".$StartingItemId
00394                 ." ORDER BY ".$this->ItemIdFieldName." LIMIT ".$NumberOfItems);
00395         $ItemIds = $this->DB->FetchColumn($this->ItemIdFieldName);
00396 
00397         # for each retrieved item ID
00398         foreach ($ItemIds as $ItemId)
00399         {
00400             # update search info for item
00401             $this->UpdateForItem($ItemId);
00402         }
00403 
00404         # return ID of last item updated to caller
00405         return $ItemId;
00406     }
00407 
00408     # drop all data pertaining to item from search DB
00409     function DropItem($ItemId)
00410     {
00411         # drop all entries pertaining to item from word count table
00412         $this->DB->Query("DELETE FROM SearchWordCounts WHERE ItemId = ".$ItemId);
00413     }
00414 
00415     # drop all data pertaining to field from search DB
00416     function DropField($FieldName)
00417     {
00418         # retrieve our ID for field
00419         $FieldId = $this->DB->Query("SELECT FieldId FROM SearchFields "
00420                 ."WHERE FieldName = '".addslashes($FieldName)."'", "FieldId");
00421 
00422         # drop all entries pertaining to field from word counts table
00423         $this->DB->Query("DELETE FROM SearchWordCounts WHERE FieldId = \'".$FieldId."\'");
00424 
00425         # drop field from our fields table
00426         $this->DB->Query("DELETE FROM SearchFields WHERE FieldId = \'".$FieldId."\'");
00427     }
00428 
00429     # return total number of terms indexed by search engine
00430     function SearchTermCount()
00431     {
00432         return $this->DB->Query("SELECT COUNT(*) AS TermCount"
00433                 ." FROM SearchWords", "TermCount");
00434     }
00435 
00436     # return total number of items indexed by search engine
00437     function ItemCount()
00438     {
00439         return $this->DB->Query("SELECT COUNT(DISTINCT ItemId) AS ItemCount"
00440                 ." FROM SearchWordCounts", "ItemCount");
00441     }
00442 
00449     function AddSynonyms($Word, $Synonyms)
00450     {
00451         # asssume no synonyms will be added
00452         $AddCount = 0;
00453 
00454         # get ID for word
00455         $WordId = $this->GetWordId($Word, TRUE);
00456 
00457         # for each synonym passed in
00458         foreach ($Synonyms as $Synonym)
00459         {
00460             # get ID for synonym
00461             $SynonymId = $this->GetWordId($Synonym, TRUE);
00462 
00463             # if synonym is not already in database
00464             $this->DB->Query("SELECT * FROM SearchWordSynonyms"
00465                     ." WHERE (WordIdA = ".$WordId
00466                         ." AND WordIdB = ".$SynonymId.")"
00467                     ." OR (WordIdB = ".$WordId
00468                         ." AND WordIdA = ".$SynonymId.")");
00469             if ($this->DB->NumRowsSelected() == 0)
00470             {
00471                 # add synonym entry to database
00472                 $this->DB->Query("INSERT INTO SearchWordSynonyms"
00473                         ." (WordIdA, WordIdB)"
00474                         ." VALUES (".$WordId.", ".$SynonymId.")");
00475                 $AddCount++;
00476             }
00477         }
00478 
00479         # report to caller number of new synonyms added
00480         return $AddCount;
00481     }
00482 
00483     # remove synonym(s)
00484     function RemoveSynonyms($Word, $Synonyms = NULL)
00485     {
00486         # find ID for word
00487         $WordId = $this->GetWordId($Word);
00488 
00489         # if ID found
00490         if ($WordId !== NULL)
00491         {
00492             # if no specific synonyms provided
00493             if ($Synonyms === NULL)
00494             {
00495                 # remove all synonyms for word
00496                 $this->DB->Query("DELETE FROM SearchWordSynonyms"
00497                         ." WHERE WordIdA = '".$WordId."'"
00498                         ." OR WordIdB = '".$WordId."'");
00499             }
00500             else
00501             {
00502                 # for each specified synonym
00503                 foreach ($Synonyms as $Synonym)
00504                 {
00505                     # look up ID for synonym
00506                     $SynonymId = $this->GetWordId($Synonym);
00507 
00508                     # if synonym ID was found
00509                     if ($SynonymId !== NULL)
00510                     {
00511                         # delete synonym entry
00512                         $this->DB->Query("DELETE FROM SearchWordSynonyms"
00513                                 ." WHERE (WordIdA = '".$WordId."'"
00514                                     ." AND WordIdB = '".$SynonymId."')"
00515                                 ." OR (WordIdB = '".$WordId."'"
00516                                     ." AND WordIdA = '".$SynonymId."')");
00517                     }
00518                 }
00519             }
00520         }
00521     }
00522 
00523     # remove all synonyms
00524     function RemoveAllSynonyms()
00525     {
00526         $this->DB->Query("DELETE FROM SearchWordSynonyms");
00527     }
00528 
00529     # get synonyms for word (returns array of synonyms)
00530     function GetSynonyms($Word)
00531     {
00532         # assume no synonyms will be found
00533         $Synonyms = array();
00534 
00535         # look up ID for word
00536         $WordId = $this->GetWordId($Word);
00537 
00538         # if word ID was found
00539         if ($WordId !== NULL)
00540         {
00541             # look up IDs of all synonyms for this word
00542             $this->DB->Query("SELECT WordIdA, WordIdB FROM SearchWordSynonyms"
00543                     ." WHERE WordIdA = ".$WordId
00544                     ." OR WordIdB = ".$WordId);
00545             $SynonymIds = array();
00546             while ($Record = $this->DB->FetchRow)
00547             {
00548                 $SynonymIds[] = ($Record["WordIdA"] == $WordId)
00549                         ? $Record["WordIdB"] : $Record["WordIdA"];
00550             }
00551 
00552             # for each synonym ID
00553             foreach ($SynonymIds as $SynonymId)
00554             {
00555                 # look up synonym word and add to synonym list
00556                 $Synonyms[] = $this->GetWord($SynonymId);
00557             }
00558         }
00559 
00560         # return synonyms to caller
00561         return $Synonyms;
00562     }
00563 
00564     # get all synonyms (returns 2D array w/ words as first index)
00565     function GetAllSynonyms()
00566     {
00567         # assume no synonyms will be found
00568         $SynonymList = array();
00569 
00570         # for each synonym ID pair
00571         $OurDB = new SPTDatabase();
00572         $OurDB->Query("SELECT WordIdA, WordIdB FROM SearchWordSynonyms");
00573         while ($Record = $OurDB->FetchRow())
00574         {
00575             # look up words
00576             $Word = $this->GetWord($Record["WordIdA"]);
00577             $Synonym = $this->GetWord($Record["WordIdB"]);
00578 
00579             # if we do not already have an entry for the word
00580             #       or synonym is not listed for this word
00581             if (!isset($SynonymList[$Word])
00582                     || !in_array($Synonym, $SynonymList[$Word]))
00583             {
00584                 # add entry for synonym
00585                 $SynonymList[$Word][] = $Synonym;
00586             }
00587 
00588             # if we do not already have an entry for the synonym
00589             #       or word is not listed for this synonym
00590             if (!isset($SynonymList[$Synonym])
00591                     || !in_array($Word, $SynonymList[$Synonym]))
00592             {
00593                 # add entry for word
00594                 $SynonymList[$Synonym][] = $Word;
00595             }
00596         }
00597 
00598         # for each word
00599         # (this loop removes reciprocal duplicates)
00600         foreach ($SynonymList as $Word => $Synonyms)
00601         {
00602             # for each synonym for that word
00603             foreach ($Synonyms as $Synonym)
00604             {
00605                 # if synonym has synonyms and word is one of them
00606                 if (isset($SynonymList[$Synonym])
00607                         && isset($SynonymList[$Word])
00608                         && in_array($Word, $SynonymList[$Synonym])
00609                         && in_array($Synonym, $SynonymList[$Word]))
00610                 {
00611                     # if word has less synonyms than synonym
00612                     if (count($SynonymList[$Word])
00613                             < count($SynonymList[$Synonym]))
00614                     {
00615                         # remove synonym from synonym list for word
00616                         $SynonymList[$Word] = array_diff(
00617                                 $SynonymList[$Word], array($Synonym));
00618 
00619                         # if no synonyms left for word
00620                         if (!count($SynonymList[$Word]))
00621                         {
00622                             # remove empty synonym list for word
00623                             unset($SynonymList[$Word]);
00624                         }
00625                     }
00626                     else
00627                     {
00628                         # remove word from synonym list for synonym
00629                         $SynonymList[$Synonym] = array_diff(
00630                                 $SynonymList[$Synonym], array($Word));
00631 
00632                         # if no synonyms left for word
00633                         if (!count($SynonymList[$Synonym]))
00634                         {
00635                             # remove empty synonym list for word
00636                             unset($SynonymList[$Synonym]);
00637                         }
00638                     }
00639                 }
00640             }
00641         }
00642 
00643         # sort array alphabetically (just for convenience)
00644         foreach ($SynonymList as $Word => $Synonyms)
00645         {
00646             asort($SynonymList[$Word]);
00647         }
00648         ksort($SynonymList);
00649 
00650         # return 2D array of synonyms to caller
00651         return $SynonymList;
00652     }
00653 
00654     # set all synonyms (accepts 2D array w/ words as first index)
00655     function SetAllSynonyms($SynonymList)
00656     {
00657         # remove all existing synonyms
00658         $this->RemoveAllSynonyms();
00659 
00660         # for each synonym entry passed in
00661         foreach ($SynonymList as $Word => $Synonyms)
00662         {
00663             # add synonyms for word
00664             $this->AddSynonyms($Word, $Synonyms);
00665         }
00666     }
00667 
00676     function LoadSynonymsFromFile($FileName)
00677     {
00678         # asssume no synonyms will be added
00679         $AddCount = 0;
00680 
00681         # read in contents of file
00682         $Lines = file($FileName, FILE_IGNORE_NEW_LINES|FILE_SKIP_EMPTY_LINES);
00683 
00684         # if file contained lines
00685         if (count($Lines))
00686         {
00687             # for each line of file
00688             foreach ($Lines as $Line)
00689             {
00690                 # if line is not a comment
00691                 if (!preg_match("/[\s]*#/", $Line))
00692                 {
00693                     # split line into words
00694                     $Words = preg_split("/[\s,]+/", $Line);
00695 
00696                     # if synonyms found
00697                     if (count($Words) > 1)
00698                     {
00699                         # separate out word and synonyms
00700                         $Word = array_shift($Words);
00701 
00702                         # add synonyms
00703                         $AddCount += $this->AddSynonyms($Word, $Words);
00704                     }
00705                 }
00706             }
00707         }
00708 
00709         # return count of synonyms added to caller
00710         return $AddCount;
00711     }
00712 
00713     # suggest alternatives
00714     function SuggestAlternateSearches($SearchString)
00715     {
00716         #
00717     }
00718 
00719 
00720     # ---- PRIVATE INTERFACE -------------------------------------------------
00721 
00722     protected $DB;
00723     protected $DebugLevel;
00724     protected $ItemTableName;
00725     protected $ItemIdFieldName;
00726     protected $NumberOfResultsAvailable;
00727     protected $LastSearchTime;
00728     protected $FilterFuncs;
00729     protected $DefaultSearchLogic = self::LOGIC_AND;
00730     protected $StemmingEnabled = TRUE;
00731     protected $SynonymsEnabled = TRUE;
00732 
00733     private $WordCountAdded;
00734     private $FieldIds;
00735     private $FieldInfo;
00736     private $RequiredTermCount;
00737     private $RequiredTermCounts;
00738     private $InclusiveTermCount;
00739     private $ExcludedTermCount;
00740     private $SearchTermList;
00741 
00742     const STEM_ID_OFFSET = 1000000;
00743 
00744 
00745     # ---- common private functions (used in both searching and DB build)
00746 
00747     # normalize and parse search string into list of search terms
00748     private function ParseSearchStringForWords($SearchString, $IgnorePhrases = FALSE)
00749     {
00750         # strip off any surrounding whitespace
00751         $Text = trim($SearchString);
00752 
00753         # set up normalization replacement strings
00754         $Patterns = array(
00755                 "/'s[^a-z0-9\\-+~]+/i", # get rid of possessive plurals
00756                 "/'/",                  # get rid of single quotes / apostrophes
00757                 "/\"[^\"]*\"/",         # get rid of phrases  (NOTE: HARD-CODED INDEX BELOW!!!)  "
00758                 "/\\([^)]*\\)/",        # get rid of groups  (NOTE: HARD-CODED INDEX BELOW!!!)
00759                 "/[^a-z0-9\\-+~]+/i",   # convert non-alphanumerics / non-minus/plus to a space
00760                 "/([^\\s])-+/i",        # convert minus preceded by anything but whitespace to a space
00761                 "/([^\\s])\\++/i",      # convert plus preceded by anything but whitespace to a space
00762                 "/-\\s/i",              # convert minus followed by whitespace to a space
00763                 "/\\+\\s/i",            # convert plus followed by whitespace to a space
00764                 "/~\\s/i",              # convert tilde followed by whitespace to a space
00765                 "/[ ]+/"                # convert multiple spaces to one space
00766                 );
00767         $Replacements = array(
00768                 " ",
00769                 "",
00770                 " ",
00771                 " ",
00772                 "\\1 ",
00773                 "\\1 ",
00774                 " ",
00775                 " ",
00776                 " ",
00777                 " ",
00778                 " "
00779                 );
00780 
00781         # if we are supposed to ignore phrases and groups (series of words in quotes or surrounded by parens)
00782         if ($IgnorePhrases)
00783         {
00784             # switch phrase removal to double quote removal  (HARD-CODED INDEX INTO PATTERN LIST!!)
00785             $Patterns[2] = "/\"/";
00786 
00787             # switch group removal to paren removal  (HARD-CODED INDEX INTO PATTERN LIST!!)
00788             $Patterns[3] = "/[\(\)]+/";
00789         }
00790 
00791         # remove punctuation from text and normalize whitespace
00792         $Text = preg_replace($Patterns, $Replacements, $Text);
00793         $this->DMsg(2, "Normalized search string is '".$Text."'");
00794 
00795         # convert text to lower case
00796         $Text = strtolower($Text);
00797 
00798         # strip off any extraneous whitespace
00799         $Text = trim($Text);
00800 
00801         # start with an empty array
00802         $Words = array();
00803 
00804         # if we have any words left after parsing
00805         if (strlen($Text) != 0)
00806         {
00807             # for each word
00808             foreach (explode(" ", $Text) as $Word)
00809             {
00810                 # grab first character of word
00811                 $FirstChar = substr($Word, 0, 1);
00812 
00813                 # strip off option characters and set flags appropriately
00814                 $Flags = WORD_PRESENT;
00815                 if ($FirstChar == "-")
00816                 {
00817                     $Word = substr($Word, 1);
00818                     $Flags |= WORD_EXCLUDED;
00819                     if (!isset($Words[$Word]))
00820                     {
00821                         $this->ExcludedTermCount++;
00822                     }
00823                 }
00824                 else
00825                 {
00826                     if ($FirstChar == "~")
00827                     {
00828                         $Word = substr($Word, 1);
00829                     }
00830                     elseif (($this->DefaultSearchLogic == self::LOGIC_AND)
00831                             || ($FirstChar == "+"))
00832                     {
00833                         if ($FirstChar == "+")
00834                         {
00835                             $Word = substr($Word, 1);
00836                         }
00837                         $Flags |= WORD_REQUIRED;
00838                         if (!isset($Words[$Word]))
00839                         {
00840                             $this->RequiredTermCount++;
00841                         }
00842                     }
00843                     if (!isset($Words[$Word]))
00844                     {
00845                         $this->InclusiveTermCount++;
00846                         $this->SearchTermList[] = $Word;
00847                     }
00848                 }
00849 
00850                 # store flags to indicate word found
00851                 $Words[$Word] = $Flags;
00852                 $this->DMsg(3, "Word identified (".$Word.")");
00853             }
00854         }
00855 
00856         # return normalized words to caller
00857         return $Words;
00858     }
00859 
00860     protected function GetFieldId($FieldName)
00861     {
00862         # if field ID is not in cache
00863         if (!isset($this->FieldIds[$FieldName]))
00864         {
00865             # look up field info in database
00866             $this->DB->Query("SELECT FieldId FROM SearchFields "
00867                     ."WHERE FieldName = '".addslashes($FieldName)."'");
00868 
00869             # if field was found
00870             if ($Record = $this->DB->FetchRow())
00871             {
00872                 # load info from DB record
00873                 $FieldId = $Record["FieldId"];
00874             }
00875             else
00876             {
00877                 # add field to database
00878                 $this->DB->Query("INSERT INTO SearchFields (FieldName) "
00879                         ."VALUES ('".addslashes($FieldName)."')");
00880 
00881                   # retrieve ID for newly added field
00882                 $FieldId = $this->DB->LastInsertId("SearchFields");
00883             }
00884 
00885             # cache field info
00886             $this->FieldIds[$FieldName] = $FieldId;
00887         }
00888 
00889         # return cached ID to caller
00890         return $this->FieldIds[$FieldName];
00891     }
00892 
00893     # retrieve ID for specified word (returns NULL if no ID found)
00894     private function GetWordId($Word, $AddIfNotFound = FALSE)
00895     {
00896         static $WordIdCache;
00897 
00898         # if word was in ID cache
00899         if (isset($WordIdCache[$Word]))
00900         {
00901             # use ID from cache
00902             $WordId = $WordIdCache[$Word];
00903         }
00904         else
00905         {
00906             # look up ID in database
00907             $WordId = $this->DB->Query("SELECT WordId"
00908                     ." FROM SearchWords"
00909                     ." WHERE WordText='".addslashes($Word)."'",
00910                     "WordId");
00911 
00912             # if ID was not found and caller requested it be added
00913             if (($WordId === NULL) && $AddIfNotFound)
00914             {
00915                 # add word to database
00916                 $this->DB->Query("INSERT INTO SearchWords (WordText)"
00917                         ." VALUES ('".addslashes(strtolower($Word))."')");
00918 
00919                 # get ID for newly added word
00920                 $WordId = $this->DB->LastInsertId("SearchWords");
00921             }
00922 
00923             # save ID to cache
00924             $WordIdCache[$Word] = $WordId;
00925         }
00926 
00927         # return ID to caller
00928         return $WordId;
00929     }
00930 
00931     # retrieve ID for specified word stem (returns NULL if no ID found)
00932     private function GetStemId($Stem, $AddIfNotFound = FALSE)
00933     {
00934         static $StemIdCache;
00935 
00936         # if stem was in ID cache
00937         if (isset($StemIdCache[$Stem]))
00938         {
00939             # use ID from cache
00940             $StemId = $StemIdCache[$Stem];
00941         }
00942         else
00943         {
00944             # look up ID in database
00945             $StemId = $this->DB->Query("SELECT WordId"
00946                     ." FROM SearchStems"
00947                     ." WHERE WordText='".addslashes($Stem)."'",
00948                     "WordId");
00949 
00950             # if ID was not found and caller requested it be added
00951             if (($StemId === NULL) && $AddIfNotFound)
00952             {
00953                 # add stem to database
00954                 $this->DB->Query("INSERT INTO SearchStems (WordText)"
00955                         ." VALUES ('".addslashes(strtolower($Stem))."')");
00956 
00957                 # get ID for newly added stem
00958                 $StemId = $this->DB->LastInsertId("SearchStems");
00959             }
00960 
00961             # adjust from DB ID value to stem ID value
00962             $StemId += self::STEM_ID_OFFSET;
00963 
00964             # save ID to cache
00965             $StemIdCache[$Stem] = $StemId;
00966         }
00967 
00968         # return ID to caller
00969         return $StemId;
00970     }
00971 
00972     # retrieve word for specified word ID (returns FALSE if no word found)
00973     private function GetWord($WordId)
00974     {
00975         static $WordCache;
00976 
00977         # if word was in cache
00978         if (isset($WordCache[$WordId]))
00979         {
00980             # use word from cache
00981             $Word = $WordCache[$WordId];
00982         }
00983         else
00984         {
00985             # adjust search location and word ID if word is stem
00986             $TableName = "SearchWords";
00987             if ($WordId >= self::STEM_ID_OFFSET)
00988             {
00989                 $TableName = "SearchStems";
00990                 $WordId -= self::STEM_ID_OFFSET;
00991             }
00992 
00993             # look up word in database
00994             $Word = $this->DB->Query("SELECT WordText"
00995                     ." FROM ".$TableName
00996                     ." WHERE WordId='".$WordId."'",
00997                     "WordText");
00998 
00999             # save word to cache
01000             $WordCache[$WordId] = $Word;
01001         }
01002 
01003         # return word to caller
01004         return $Word;
01005     }
01006 
01007 
01008     # ---- private functions used in searching
01009 
01010     # perform search across multiple fields and return raw results to caller
01011     private function SearchAcrossFields($SearchStrings)
01012     {
01013         # start by assuming no search will be done
01014         $Scores = NULL;
01015 
01016         # clear word counts
01017         $this->InclusiveTermCount = 0;
01018         $this->RequiredTermCount = 0;
01019         $this->ExcludedTermCount = 0;
01020 
01021         # for each field
01022         $NeedComparisonSearch = FALSE;
01023         foreach ($SearchStrings as $FieldName => $SearchStringArray)
01024         {
01025             # convert search string to array if needed
01026             if (!is_array($SearchStringArray))
01027             {
01028                 $SearchStringArray = array($SearchStringArray);
01029             }
01030 
01031             # for each search string for this field
01032             foreach ($SearchStringArray as $SearchString)
01033             {
01034                 # if field is keyword or field is text and does not look like comparison match
01035                 if (($FieldName == "XXXKeywordXXX")
01036                     || (isset($this->FieldInfo[$FieldName])
01037                         && ($this->FieldInfo[$FieldName]["FieldType"] == self::FIELDTYPE_TEXT)
01038                         && !preg_match("/^[><!]=./", $SearchString)
01039                         && !preg_match("/^[><=]./", $SearchString)))
01040                 {
01041                     $this->DMsg(0, "Searching text field \""
01042                             .$FieldName."\" for string \"$SearchString\"");
01043 
01044                     # normalize text and split into words
01045                     $Words[$FieldName] =
01046                             $this->ParseSearchStringForWords($SearchString);
01047 
01048                     # calculate scores for matching items
01049                     if (count($Words[$FieldName]))
01050                     {
01051                         $Scores = $this->SearchForWords(
01052                                 $Words[$FieldName], $FieldName, $Scores);
01053                         $this->DMsg(3, "Have "
01054                                 .count($Scores)." results after word search");
01055                     }
01056 
01057                     # split into phrases
01058                     $Phrases[$FieldName] =
01059                             $this->ParseSearchStringForPhrases($SearchString);
01060 
01061                     # handle any phrases
01062                     if (count($Phrases[$FieldName]))
01063                     {
01064                         $Scores = $this->SearchForPhrases(
01065                                 $Phrases[$FieldName], $Scores, $FieldName, TRUE, FALSE);
01066                         $this->DMsg(3, "Have "
01067                                 .count($Scores)." results after phrase search");
01068                     }
01069                 }
01070                 else
01071                 {
01072                     # set flag to indicate possible comparison search candidate found
01073                     $NeedComparisonSearch = TRUE;
01074                 }
01075             }
01076         }
01077 
01078         # perform comparison searches
01079         if ($NeedComparisonSearch)
01080         {
01081             $Scores = $this->SearchForComparisonMatches($SearchStrings, $Scores);
01082             $this->DMsg(3, "Have ".count($Scores)." results after comparison search");
01083         }
01084 
01085         # if no results found and exclusions specified
01086         if (!count($Scores) && $this->ExcludedTermCount)
01087         {
01088             # load all records
01089             $Scores = $this->LoadScoresForAllRecords();
01090         }
01091 
01092         # if search results found
01093         if (count($Scores))
01094         {
01095             # for each search text string
01096             foreach ($SearchStrings as $FieldName => $SearchStringArray)
01097             {
01098                 # convert search string to array if needed
01099                 if (!is_array($SearchStringArray))
01100                 {
01101                     $SearchStringArray = array($SearchStringArray);
01102                 }
01103 
01104                 # for each search string for this field
01105                 foreach ($SearchStringArray as $SearchString)
01106                 {
01107                     # if field is text
01108                     if (($FieldName == "XXXKeywordXXX")
01109                             || (isset($this->FieldInfo[$FieldName])
01110                                 && ($this->FieldInfo[$FieldName]["FieldType"]
01111                                         == self::FIELDTYPE_TEXT)))
01112                     {
01113                         # if there are words in search text
01114                         if (isset($Words[$FieldName]))
01115                         {
01116                             # handle any excluded words
01117                             $Scores = $this->FilterOnExcludedWords($Words[$FieldName], $Scores, $FieldName);
01118                         }
01119 
01120                         # handle any excluded phrases
01121                         if (isset($Phrases[$FieldName]))
01122                         {
01123                             $Scores = $this->SearchForPhrases(
01124                                     $Phrases[$FieldName], $Scores, $FieldName, FALSE, TRUE);
01125                         }
01126                     }
01127                 }
01128             }
01129 
01130             # strip off any results that don't contain required words
01131             $Scores = $this->FilterOnRequiredWords($Scores);
01132         }
01133 
01134         # return search result scores to caller
01135         return $Scores;
01136     }
01137 
01138     # search for words in specified field
01139     private function SearchForWords(
01140             $Words, $FieldName = "XXXKeywordXXX", $Scores = NULL)
01141     {
01142         $DB = $this->DB;
01143 
01144         # start with empty search result scores list if none passed in
01145         if ($Scores == NULL)
01146         {
01147             $Scores = array();
01148         }
01149 
01150         # grab field ID
01151         $FieldId = $this->GetFieldId($FieldName);
01152 
01153         # for each word
01154         foreach ($Words as $Word => $Flags)
01155         {
01156             $this->DMsg(2, "Searching for word '${Word}' in field ".$FieldName);
01157 
01158             # if word is not excluded
01159             if (!($Flags & WORD_EXCLUDED))
01160             {
01161                 # look up record ID for word
01162                 $this->DMsg(2, "Looking up word \"".$Word."\"");
01163                 $WordId = $this->GetWordId($Word);
01164 
01165                 # if word is in DB
01166                 if ($WordId !== NULL)
01167                 {
01168                     # look up counts for word
01169                     $DB->Query("SELECT ItemId,Count FROM SearchWordCounts "
01170                             ."WHERE WordId = ".$WordId
01171                             ." AND FieldId = ".$FieldId);
01172                     $Counts = $DB->FetchColumn("Count", "ItemId");
01173 
01174                     # if synonym support is enabled
01175                     if ($this->SynonymsEnabled)
01176                     {
01177                         # look for any synonyms
01178                         $DB->Query("SELECT WordIdA, WordIdB"
01179                                 ." FROM SearchWordSynonyms"
01180                                 ." WHERE WordIdA = ".$WordId
01181                                 ." OR WordIdB = ".$WordId);
01182 
01183                         # if synonyms were found
01184                         if ($DB->NumRowsSelected())
01185                         {
01186                             # retrieve synonym IDs
01187                             $SynonymIds = array();
01188                             while ($Record = $DB->FetchRow())
01189                             {
01190                                 $SynonymIds[] = ($Record["WordIdA"] == $WordId)
01191                                         ? $Record["WordIdB"]
01192                                         : $Record["WordIdA"];
01193                             }
01194 
01195                             # for each synonym
01196                             foreach ($SynonymIds as $SynonymId)
01197                             {
01198                                 # retrieve counts for synonym
01199                                 $DB->Query("SELECT ItemId,Count"
01200                                         ." FROM SearchWordCounts"
01201                                         ." WHERE WordId = ".$SynonymId
01202                                         ." AND FieldId = ".$FieldId);
01203                                 $SynonymCounts = $DB->FetchColumn("Count", "ItemId");
01204 
01205                                 # for each count
01206                                 foreach ($SynonymCounts as $ItemId => $Count)
01207                                 {
01208                                     # adjust count because it's a synonym
01209                                     $AdjustedCount = ceil($Count / 2);
01210 
01211                                     # add count to existing counts
01212                                     if (isset($Counts[$ItemId]))
01213                                     {
01214                                         $Counts[$ItemId] += $AdjustedCount;
01215                                     }
01216                                     else
01217                                     {
01218                                         $Counts[$ItemId] = $AdjustedCount;
01219                                     }
01220                                 }
01221                             }
01222                         }
01223                     }
01224                 }
01225 
01226                 # if stemming is enabled
01227                 if ($this->StemmingEnabled)
01228                 {
01229                     # retrieve stem ID
01230                     $Stem = PorterStemmer::Stem($Word);
01231                     $this->DMsg(2, "Looking up stem \"".$Stem."\"");
01232                     $StemId = $this->GetStemId($Stem);
01233 
01234                     # if ID found for stem
01235                     if ($StemId !== NULL)
01236                     {
01237                         # retrieve counts for stem
01238                         $DB->Query("SELECT ItemId,Count"
01239                                 ." FROM SearchWordCounts"
01240                                 ." WHERE WordId = ".$StemId
01241                                 ." AND FieldId = ".$FieldId);
01242                         $StemCounts = $DB->FetchColumn("Count", "ItemId");
01243 
01244                         # for each count
01245                         foreach ($StemCounts as $ItemId => $Count)
01246                         {
01247                             # adjust count because it's a stem
01248                             $AdjustedCount = ceil($Count / 2);
01249 
01250                             # add count to existing counts
01251                             if (isset($Counts[$ItemId]))
01252                             {
01253                                 $Counts[$ItemId] += $AdjustedCount;
01254                             }
01255                             else
01256                             {
01257                                 $Counts[$ItemId] = $AdjustedCount;
01258                             }
01259                         }
01260                     }
01261                 }
01262 
01263                 # if counts were found
01264                 if (isset($Counts))
01265                 {
01266                     # for each count
01267                     foreach ($Counts as $ItemId => $Count)
01268                     {
01269                         # if word flagged as required
01270                         if ($Flags & WORD_REQUIRED)
01271                         {
01272                             # increment required word count for record
01273                             if (isset($this->RequiredTermCounts[$ItemId]))
01274                             {
01275                                 $this->RequiredTermCounts[$ItemId]++;
01276                             }
01277                             else
01278                             {
01279                                 $this->RequiredTermCounts[$ItemId] = 1;
01280                             }
01281                         }
01282 
01283                         # add to item record score
01284                         if (isset($Scores[$ItemId]))
01285                         {
01286                             $Scores[$ItemId] += $Count;
01287                         }
01288                         else
01289                         {
01290                             $Scores[$ItemId] = $Count;
01291                         }
01292                     }
01293                 }
01294             }
01295         }
01296 
01297         # return basic scores to caller
01298         return $Scores;
01299     }
01300 
01301     # extract phrases (terms surrounded by quotes) from search string
01302     private function ParseSearchStringForPhrases($SearchString)
01303     {
01304         # split into chunks delimited by double quote marks
01305         $Pieces = explode("\"", $SearchString);   # "
01306 
01307         # for each pair of chunks
01308         $Index = 2;
01309         $Phrases = array();
01310         while ($Index < count($Pieces))
01311         {
01312             # grab phrase from chunk
01313             $Phrase = trim(addslashes($Pieces[$Index - 1]));
01314             $Flags = WORD_PRESENT;
01315 
01316             # grab first character of phrase
01317             $FirstChar = substr($Pieces[$Index - 2], -1);
01318 
01319             # set flags to reflect any option characters
01320             if ($FirstChar == "-")
01321             {
01322                 $Flags |= WORD_EXCLUDED;
01323                 if (!isset($Phrases[$Phrase]))
01324                 {
01325                     $this->ExcludedTermCount++;
01326                 }
01327             }
01328             else
01329             {
01330                 if ((($this->DefaultSearchLogic == self::LOGIC_AND) && ($FirstChar != "~"))
01331                         || ($FirstChar == "+"))
01332                 {
01333                     $Flags |= WORD_REQUIRED;
01334                     if (!isset($Phrases[$Phrase]))
01335                     {
01336                         $this->RequiredTermCount++;
01337                     }
01338                 }
01339                 if (!isset($Phrases[$Phrase]))
01340                 {
01341                     $this->InclusiveTermCount++;
01342                     $this->SearchTermList[] = $Phrase;
01343                 }
01344             }
01345             $Phrases[$Phrase] = $Flags;
01346 
01347             # move to next pair of chunks
01348             $Index += 2;
01349         }
01350 
01351         # return phrases to caller
01352         return $Phrases;
01353     }
01354 
01355     # extract groups (terms surrounded by parens) from search string
01356     # (NOTE: NOT YET IMPLEMENTED!!!)
01357     private function ParseSearchStringForGroups($SearchString)
01358     {
01359         # split into chunks delimited by open paren
01360         $Pieces = explode("(", $SearchString);
01361 
01362         # for each chunk
01363         $Index = 2;
01364         while ($Index < count($Pieces))
01365         {
01366             # grab phrase from chunk
01367             $Group = trim(addslashes($Pieces[$Index - 1]));
01368             $Groups[] = $Group;
01369 
01370             # move to next pair of chunks
01371             $Index += 2;
01372         }
01373 
01374         # return phrases to caller
01375         return $Groups;
01376     }
01377 
01378     protected function SearchFieldForPhrases($FieldName, $Phrase)
01379     {
01380         # error out
01381         exit("<br>SE - ERROR:  SearchFieldForPhrases() not implemented<br>\n");
01382     }
01383 
01384     private function SearchForPhrases($Phrases, $Scores, $FieldName = "XXXKeywordXXX",
01385             $ProcessNonExcluded = TRUE, $ProcessExcluded = TRUE)
01386     {
01387         # if phrases are found
01388         if (count($Phrases) > 0)
01389         {
01390             # if this is a keyword search
01391             if ($FieldName == "XXXKeywordXXX")
01392             {
01393                 # for each field
01394                 foreach ($this->FieldInfo as $KFieldName => $Info)
01395                 {
01396                     # if field is marked to be included in keyword searches
01397                     if ($Info["InKeywordSearch"])
01398                     {
01399                         # call ourself with that field
01400                         $Scores = $this->SearchForPhrases($Phrases, $Scores, $KFieldName,
01401                                                           $ProcessNonExcluded, $ProcessExcluded);
01402                     }
01403                 }
01404             }
01405             else
01406             {
01407                 # for each phrase
01408                 foreach ($Phrases as $Phrase => $Flags)
01409                 {
01410                     $this->DMsg(2, "Searching for phrase '".$Phrase
01411                             ."' in field ".$FieldName);
01412 
01413                     # if phrase flagged as excluded and we are doing excluded phrases
01414                     #         or phrase flagged as non-excluded and we are doing non-excluded phrases
01415                     if (($ProcessExcluded && ($Flags & WORD_EXCLUDED))
01416                             || ($ProcessNonExcluded && !($Flags & WORD_EXCLUDED)))
01417                     {
01418                         # initialize score list if necessary
01419                         if ($Scores === NULL) {  $Scores = array();  }
01420 
01421                         # retrieve list of items that contain phrase
01422                         $ItemIds = $this->SearchFieldForPhrases(
01423                                 $FieldName, $Phrase);
01424 
01425                         # for each item that contains phrase
01426                         foreach ($ItemIds as $ItemId)
01427                         {
01428                             # if we are doing excluded phrases and phrase flagged as excluded
01429                             if ($ProcessExcluded && ($Flags & WORD_EXCLUDED))
01430                             {
01431                                 # knock item off of list
01432                                 unset($Scores[$ItemId]);
01433                             }
01434                             elseif ($ProcessNonExcluded)
01435                             {
01436                                 # calculate phrase value based on number of words and field weight
01437                                 $PhraseScore = count(preg_split("/[\s]+/", $Phrase, -1, PREG_SPLIT_NO_EMPTY))
01438                                                        * $this->FieldInfo[$FieldName]["Weight"];
01439                                 $this->DMsg(2, "Phrase score is ".$PhraseScore);
01440 
01441                                 # bump up item record score
01442                                 if (isset($Scores[$ItemId]))
01443                                 {
01444                                     $Scores[$ItemId] += $PhraseScore;
01445                                 }
01446                                 else
01447                                 {
01448                                     $Scores[$ItemId] = $PhraseScore;
01449                                 }
01450 
01451                                 # if phrase flagged as required
01452                                 if ($Flags & WORD_REQUIRED)
01453                                 {
01454                                     # increment required word count for record
01455                                     if (isset($this->RequiredTermCounts[$ItemId]))
01456                                     {
01457                                         $this->RequiredTermCounts[$ItemId]++;
01458                                     }
01459                                     else
01460                                     {
01461                                         $this->RequiredTermCounts[$ItemId] = 1;
01462                                     }
01463                                 }
01464                             }
01465                         }
01466                     }
01467                 }
01468             }
01469         }
01470 
01471         # return updated scores to caller
01472         return $Scores;
01473     }
01474 
01475     private function FilterOnExcludedWords($Words, $Scores, $FieldName = "XXXKeywordXXX")
01476     {
01477         $DB = $this->DB;
01478 
01479         # grab field ID
01480         $FieldId = $this->GetFieldId($FieldName);
01481 
01482         # for each word
01483         foreach ($Words as $Word => $Flags)
01484         {
01485             # if word flagged as excluded
01486             if ($Flags & WORD_EXCLUDED)
01487             {
01488                 # look up record ID for word
01489                 $WordId = $this->GetWordId($Word);
01490 
01491                 # if word is in DB
01492                 if ($WordId !== NULL)
01493                 {
01494                     # look up counts for word
01495                     $DB->Query("SELECT ItemId FROM SearchWordCounts "
01496                             ."WHERE WordId=${WordId} AND FieldId=${FieldId}");
01497 
01498                     # for each count
01499                     while ($Record = $DB->FetchRow())
01500                     {
01501                         # if item record is in score list
01502                         $ItemId = $Record["ItemId"];
01503                         if (isset($Scores[$ItemId]))
01504                         {
01505                             # remove item record from score list
01506                             $this->DMsg(3, "Filtering out item ".$ItemId
01507                                     ." because it contained word \"".$Word."\"");
01508                             unset($Scores[$ItemId]);
01509                         }
01510                     }
01511                 }
01512             }
01513         }
01514 
01515         # returned filtered score list to caller
01516         return $Scores;
01517     }
01518 
01519     private function FilterOnRequiredWords($Scores)
01520     {
01521         # if there were required words
01522         if ($this->RequiredTermCount > 0)
01523         {
01524             # for each item
01525             foreach ($Scores as $ItemId => $Score)
01526             {
01527                 # if item does not meet required word count
01528                 if (!isset($this->RequiredTermCounts[$ItemId])
01529                         || ($this->RequiredTermCounts[$ItemId] < $this->RequiredTermCount))
01530                 {
01531                     # filter out item
01532                     $this->DMsg(4, "Filtering out item ".$ItemId
01533                             ." because it didn't have required word count of "
01534                             .$this->RequiredTermCount
01535                             .(isset($this->RequiredTermCounts[$ItemId])
01536                                     ? " (only had "
01537                                     .$this->RequiredTermCounts[$ItemId]
01538                                     : " (had none")
01539                             .")");
01540                     unset($Scores[$ItemId]);
01541                 }
01542             }
01543         }
01544 
01545         # return filtered list to caller
01546         return $Scores;
01547     }
01548 
# Count, sort, and trim search result scores list.  Filter callbacks are
# applied first, the number of results left is saved in
# $this->NumberOfResultsAvailable, and the list is then ordered and cut
# down to the requested range.
#   $Scores - search scores, indexed by item ID
#   $StartingResult - zero-based offset of first result to return
#   $NumberOfResults - maximum number of results to return
#   $SortByField - name of field to sort by, or NULL to sort by score
#   $SortDescending - TRUE to sort in descending order (used directly when
#       sorting by score, otherwise passed to GetItemIdsSortedByField())
#   returns trimmed score list, still indexed by item ID
private function CleanScores($Scores, $StartingResult, $NumberOfResults,
        $SortByField, $SortDescending)
{
    # perform any requested filtering
    $this->DMsg(0, "Have ".count($Scores)." results before filter callbacks");
    $Scores = $this->FilterOnSuppliedFunctions($Scores);

    # save total number of results available
    $this->NumberOfResultsAvailable = count($Scores);

    # if no sorting field specified
    if ($SortByField === NULL)
    {
        # sort result list by score
        if ($SortDescending)
            arsort($Scores, SORT_NUMERIC);
        else
            asort($Scores, SORT_NUMERIC);
    }
    else
    {
        # get list of item IDs in sorted order
        $SortedIds = $this->GetItemIdsSortedByField(
                $SortByField, $SortDescending);

        # if we have sorted item IDs
        if (count($SortedIds) && count($Scores))
        {
            # strip sorted ID list down to those that appear in search results
            $SortedIds = array_intersect($SortedIds, array_keys($Scores));

            # rebuild score list in sorted order (starting from an empty
            #       array so that a sorted ID list sharing no IDs with the
            #       results gives an empty list, not an undefined variable)
            $NewScores = array();
            foreach ($SortedIds as $Id)
            {
                $NewScores[$Id] = $Scores[$Id];
            }
            $Scores = $NewScores;
        }
        else
        {
            # sort result list by score
            arsort($Scores, SORT_NUMERIC);
        }
    }

    # trim result list to match range requested by caller (keys must be
    #       preserved because they are the item IDs)
    return array_slice($Scores, $StartingResult, $NumberOfResults, TRUE);
}
01604 
# Pass each result through the callbacks listed in $this->FilterFuncs and
# discard any item for which a callback returns a TRUE-equivalent value.
# Callbacks are invoked with the item ID as their only argument.
#   $Scores - search scores, indexed by item ID
#   returns score list with rejected items removed
protected function FilterOnSuppliedFunctions($Scores)
{
    # no callbacks registered means there is nothing to do
    if (!isset($this->FilterFuncs)) {  return $Scores;  }

    foreach ($Scores as $ItemId => $Score)
    {
        foreach ($this->FilterFuncs as $Callback)
        {
            # move on to next callback unless this one rejects the item
            if (!$Callback($ItemId)) {  continue;  }

            # discard result
            $this->DMsg(2, "Filter callback <i>".$Callback
                    ."</i> rejected item ".$ItemId);
            unset($Scores[$ItemId]);

            # first rejection settles it, so remaining callbacks are skipped
            break;
        }
    }

    # hand filtered list back to caller
    return $Scores;
}
01634 
# Pick out comparison-style search terms, run them via
# SearchFieldsForComparisonMatches(), and fold the matching items into the
# score list according to $this->DefaultSearchLogic.  A term is treated as
# a comparison when it begins with an operator (>=  <=  !=  >  <  =)
# followed by at least one more character, or when it is aimed at a field
# whose recorded type is not FIELDTYPE_TEXT (in which case "=" is assumed).
#   $SearchStrings - search strings (a string or array of strings per
#       entry), indexed by field name;  the "XXXKeywordXXX" entry is ignored
#   $Scores - current search scores, indexed by item ID (may be NULL)
#   returns updated search scores
private function SearchForComparisonMatches($SearchStrings, $Scores)
{
    # comparisons found (entry N of each array describes one comparison)
    $FieldNames = array();
    $Operators = array();
    $Values = array();

    # for each field
    foreach ($SearchStrings as $SearchFieldName => $SearchStringArray)
    {
        # keyword terms are never treated as comparisons
        if ($SearchFieldName == "XXXKeywordXXX") {  continue;  }

        # convert search string to array if needed
        if (!is_array($SearchStringArray))
        {
            $SearchStringArray = array($SearchStringArray);
        }

        # terms for non-text fields are comparisons even without an operator
        $IsNonTextField =
                isset($this->FieldInfo[$SearchFieldName]["FieldType"])
                && ($this->FieldInfo[$SearchFieldName]["FieldType"]
                        != self::FIELDTYPE_TEXT);

        # for each search string for this field
        foreach ($SearchStringArray as $SearchString)
        {
            # skip search string if it does not look like comparison search
            $FoundOperator = preg_match("/^[><!]=./", $SearchString)
                    || preg_match("/^[><=]./", $SearchString);
            if (!$FoundOperator && !$IsNonTextField) {  continue;  }

            # determine value (whatever is left after stripping operator)
            $Patterns = array("/^[><!]=/", "/^[><=]/");
            $Replacements = array("", "");
            $Value = trim(preg_replace($Patterns, $Replacements, $SearchString));

            # determine operator (equality is assumed when none was given)
            $Operator = "=";
            if ($FoundOperator)
            {
                # two-character operators are listed first so that ">="
                #       is not read as ">";  matching against the trimmed
                #       term avoids reading past the end of short terms
                if (!preg_match("/^([><!]=|[><=])/",
                        trim($SearchString), $Matches))
                {
                    # (not expected to happen, given the check above)
                    continue;
                }
                $Operator = $Matches[1];
            }

            # save comparison
            $FieldNames[] = $SearchFieldName;
            $Operators[] = $Operator;
            $Values[] = $Value;

            # (pieces are escaped because the message is emitted as HTML
            #       and both operator and value may contain "<" or ">")
            $this->DMsg(3, "Added comparison (field = <i>"
                    .htmlspecialchars($SearchFieldName)."</i>  op = <i>"
                    .htmlspecialchars($Operator)."</i>  val = <i>"
                    .htmlspecialchars($Value)."</i>)");
        }
    }

    # scores are returned untouched if no comparisons were found
    if (!count($Operators)) {  return $Scores;  }

    # perform comparisons on fields and gather results
    $Results = $this->SearchFieldsForComparisonMatches(
            $FieldNames, $Operators, $Values);

    # if search logic is set to AND
    if ($this->DefaultSearchLogic == self::LOGIC_AND)
    {
        if (!count($Results))
        {
            # nothing matched, so nothing can satisfy every condition
            $Scores = array();
        }
        elseif (empty($Scores) && ($this->InclusiveTermCount == 0))
        {
            # there were no prior results and no terms for keyword search,
            #       so results become the scores (empty() rather than
            #       count() because $Scores may be NULL)
            $Scores = array();
            foreach ($Results as $ItemId)
            {
                $Scores[$ItemId] = 1;
            }
        }
        elseif (is_array($Scores))
        {
            # remove anything from scores that is not part of results
            # NOTE(review): keyed lookup assumes item IDs are integers or
            #       strings, as array_flip() requires -- confirm against
            #       SearchFieldsForComparisonMatches()
            $Scores = array_intersect_key($Scores, array_flip($Results));
        }
    }
    else
    {
        # add result items to scores
        if ($Scores === NULL) {  $Scores = array();  }
        foreach ($Results as $ItemId)
        {
            $Scores[$ItemId] = isset($Scores[$ItemId])
                    ? $Scores[$ItemId] + 1 : 1;
        }
    }

    # return results to caller
    return $Scores;
}
01765 
# Pass every search string in the supplied search info through
# ExtractDebugLevel(), which may set the debug level and strip the debug
# level indicator from the string.
#   $SearchStrings - a single search string, or an array (indexed by field
#       name) whose entries are search strings or arrays of search strings
#   returns search info in the same shape, with each string replaced by
#       the value ExtractDebugLevel() returned for it
private function SetDebugLevel($SearchStrings)
{
    # a lone search string needs only a single pass
    if (!is_array($SearchStrings))
    {
        return $this->ExtractDebugLevel($SearchStrings);
    }

    # otherwise build processed copy of search info, field by field
    $Processed = $SearchStrings;
    foreach ($SearchStrings as $FieldName => $FieldStrings)
    {
        if (!is_array($FieldStrings))
        {
            # field has one search string
            $Processed[$FieldName] = $this->ExtractDebugLevel($FieldStrings);
            continue;
        }

        # field has list of search strings
        foreach ($FieldStrings as $Index => $SearchString)
        {
            $Processed[$FieldName][$Index] =
                    $this->ExtractDebugLevel($SearchString);
        }
    }

    # return new search info to caller
    return $Processed;
}
01800 
# Check search string for a debug level indicator:  "DBUGLVL=" followed by
# one or two digits, each in the range 1-9, at the start of the string
# (leading whitespace allowed).  If one is found, $this->DebugLevel is set
# from it and every occurrence of that indicator is removed from the string.
#   $SearchString - search string to check
#   returns (possibly) modified search string
private function ExtractDebugLevel($SearchString)
{
    # capture level directly from the indicator, so that a string without
    #       a well-formed leading indicator (e.g. one that merely starts
    #       with a digit and mentions "DBUGLVL=" later on) can never be
    #       mistaken for a level
    if (preg_match("/^\\s*DBUGLVL=([1-9]{1,2})/", $SearchString, $Matches))
    {
        # set debug level
        $Level = (int)$Matches[1];
        $this->DebugLevel = $Level;
        $this->DMsg(0, "Setting debug level to ".$Level);

        # remove indicator from search string
        $SearchString = str_replace("DBUGLVL=".$Level, "", $SearchString);
    }

    # return (possibly) modified search string to caller
    return $SearchString;
}
01819 
# Build search result scores array with an entry for every row in the item
# table, each with a score of 1.
#   returns scores array, indexed by item ID
private function LoadScoresForAllRecords()
{
    # ask database for ID of every item
    $IdField = $this->ItemIdFieldName;
    $this->DB->Query("SELECT ".$IdField." FROM ".$this->ItemTableName);

    # give each item found a score of 1
    $AllScores = array();
    while ($Row = $this->DB->FetchRow())
    {
        $AllScores[$Row[$IdField]] = 1;
    }

    # return array with all scores to caller
    return $AllScores;
}
01838 
01839 
01840     # ---- private functions used in building search database
01841 
# Add to the count stored in SearchWordCounts for a word (and, when
# stemming is enabled, for the stem of the word) in a field of an item.
# The first count for a word/field pair is inserted as a new row, and
# later ones are added to that row, as tracked in $this->WordCountAdded.
#   $Word - word to record
#   $ItemId - ID of item the word appeared in
#   $FieldId - ID of field the word appeared in
#   $Weight - amount to add to count for word (OPTIONAL, defaults to 1);
#       the stem receives half of this amount, rounded up
private function UpdateWordCount($Word, $ItemId, $FieldId, $Weight = 1)
{
    # start with ID for word itself
    $Ids = array($this->GetWordId($Word, TRUE));

    # add ID for stem of word if stemming is enabled
    if ($this->StemmingEnabled)
    {
        $Stem = PorterStemmer::Stem($Word, TRUE);
        $Ids[] = $this->GetStemId($Stem, TRUE);
    }

    # for word and stem of word
    $Amount = $Weight;
    foreach ($Ids as $Id)
    {
        if (!isset($this->WordCountAdded[$Id][$FieldId]))
        {
            # no count added yet, so add new word count to DB
            $this->DB->Query("INSERT INTO SearchWordCounts"
                    ." (WordId, ItemId, FieldId, Count) VALUES"
                    ." (".$Id.", ".$ItemId.", ".$FieldId.", ".$Amount.")");

            # remember that we added count for this word
            $this->WordCountAdded[$Id][$FieldId] = TRUE;
        }
        else
        {
            # count already in DB, so increase it
            $this->DB->Query("UPDATE SearchWordCounts SET Count=Count+".$Amount
                    ." WHERE WordId=".$Id
                            ." AND ItemId=".$ItemId
                            ." AND FieldId=".$FieldId);
        }

        # decrease weight for stem
        $Amount = ceil($Amount / 2);
    }
}
01889 
# Placeholder for retrieving the content of a field for an item.  This
# version does nothing except halt the script with an error message.
# NOTE(review): presumably meant to be overridden by a subclass -- confirm
#   $ItemId - ID of item (unused here)
#   $FieldName - name of field (unused here)
protected function GetFieldContent($ItemId, $FieldName)
{
    # error out
    $Message = "<br>SE - ERROR: GetFieldContent() not implemented<br>\n";
    exit($Message);
}
01895 
# Record word counts for a piece of text belonging to a field of an item.
# Each word parsed from the text is counted against the field itself (with
# the default weight) and, if requested, against the "XXXKeywordXXX" field
# (with the supplied weight).
#   $ItemId - ID of item the text belongs to
#   $FieldName - name of field the text came from
#   $Weight - weight used for the keyword field counts
#   $Text - text to record
#   $IncludeInKeyword - TRUE if text should be included in keyword searches
private function RecordSearchInfoForText(
        $ItemId, $FieldName, $Weight, $Text, $IncludeInKeyword)
{
    # normalize text, and bail out if nothing was left after parsing
    $Words = $this->ParseSearchStringForWords($Text, TRUE);
    if (!(count($Words) > 0)) {  return;  }

    # get ID for field
    $FieldId = $this->GetFieldId($FieldName);

    # get ID for keyword field if text should be included in keyword searches
    if ($IncludeInKeyword)
    {
        $KeywordFieldId = $this->GetFieldId("XXXKeywordXXX");
    }

    # for each word (words are the array keys, values are not needed here)
    foreach ($Words as $Word => $Unused)
    {
        # update count for word
        $this->UpdateWordCount($Word, $ItemId, $FieldId);

        # update keyword field count for word when requested
        if ($IncludeInKeyword)
        {
            $this->UpdateWordCount(
                    $Word, $ItemId, $KeywordFieldId, $Weight);
        }
    }
}
01931 
# Print debug message if current debug level is greater than the level
# given for the message.  The message is written out as-is, prefixed with
# "SE:  " and followed by an HTML line break, so any markup in it is
# passed through.
#   $Level - debug level that must be exceeded for message to be printed
#   $Msg - message to print
protected function DMsg($Level, $Msg)
{
    # stay quiet unless debug level is set high enough
    if (!($this->DebugLevel > $Level)) {  return;  }

    echo "SE:  ".$Msg."<br>\n";
}
01940 
01941     # ---- BACKWARD COMPATIBILITY --------------------------------------------
01942 
01943     # possible types of logical operators
01944     const SEARCHLOGIC_AND = 1;
01945     const SEARCHLOGIC_OR = 2;
01946 }
01947 
01948 ?>

CWIS logo doxygen
Copyright 2010 Internet Scout