3 # FILE: Recommender.php 5 # Part of the Collection Workflow Integration System (CWIS) 6 # Copyright 2004-2017 Edward Almasy and Internet Scout Research Group 7 # http://scout.wisc.edu/cwis/ 16 # ---- PUBLIC INTERFACE -------------------------------------------------- 17 # define content field types 41 public function __construct(&$DB, $ItemTableName, $RatingTableName,
42 $ItemIdFieldName, $UserIdFieldName, $RatingFieldName,
45 # set default parameters 46 $this->ContentCorrelationThreshold = 1;
48 # save database object 51 # save new configuration values 52 $this->ItemTableName = $ItemTableName;
53 $this->RatingTableName = $RatingTableName;
54 $this->ItemIdFieldName = $ItemIdFieldName;
55 $this->UserIdFieldName = $UserIdFieldName;
56 $this->RatingFieldName = $RatingFieldName;
57 $this->ContentFields = $ContentFields;
59 # set default debug state 73 # ---- recommendation methods 83 public function Recommend($UserId, $StartingResult = 0, $NumberOfResults = 10)
87 print
"REC: Recommend(${UserId}, ${StartingResult}," 88 .
" ${NumberOfResults})<br>\n";
91 # load in user ratings 94 $DB->Query(
"SELECT ".$this->ItemIdFieldName.
", ".$this->RatingFieldName
95 .
" FROM ".$this->RatingTableName
96 .
" WHERE ".$this->UserIdFieldName.
" = ${UserId}");
97 while ($Row = $DB->FetchRow())
99 $Ratings[$Row[$this->ItemIdFieldName]] =
100 $Row[$this->RatingFieldName];
104 print
"REC: user has rated ".count($Ratings).
" items<br>\n";
107 # for each item that user has rated 109 foreach ($Ratings as $ItemId => $ItemRating)
111 # for each content correlation available for that item 112 $DB->Query(
"SELECT Correlation, ItemIdB " 113 .
"FROM RecContentCorrelations " 114 .
"WHERE ItemIdA = ${ItemId}");
115 while ($Row = $DB->FetchRow())
117 # multiply that correlation by normalized rating and add 118 # resulting value to recommendation value for that item 119 if (isset($RecVals[$Row[
"ItemIdB"]]))
121 $RecVals[$Row[
"ItemIdB"]] +=
122 $Row[
"Correlation"] * ($ItemRating - 50);
126 $RecVals[$Row[
"ItemIdB"]] =
127 $Row[
"Correlation"] * ($ItemRating - 50);
131 print
"REC: RecVal[".$Row[
"ItemIdB"].
"] = " 132 .$RecVals[$Row[
"ItemIdB"]].
"<br>\n";
138 print
"REC: found ".count($RecVals).
" total recommendations<br>\n";
141 # calculate average correlation between items 142 $ResultThreshold = $DB->Query(
"SELECT AVG(Correlation) " 143 .
"AS Average FROM RecContentCorrelations",
"Average");
144 $ResultThreshold = round($ResultThreshold) * 2;
146 # for each recommended item 147 foreach ($RecVals as $ItemId => $RecVal)
149 # remove item from list if user already rated it 150 if (isset($Ratings[$ItemId]))
152 unset($RecVals[$ItemId]);
156 # scale recommendation value back to match thresholds 157 $RecVals[$ItemId] = round($RecVal / 50);
159 # remove item from recommendation list if value is below threshold 160 if ($RecVals[$ItemId] < $ResultThreshold)
162 unset($RecVals[$ItemId]);
168 print
"REC: found ".count($RecVals).
" positive recommendations<br>\n";
171 # sort recommendation list by value 172 if (isset($RecVals)) { arsort($RecVals, SORT_NUMERIC); }
174 # save total number of results available 175 $this->NumberOfResultsAvailable = count($RecVals);
177 # trim result list to match range requested by caller 178 $RecValKeys = array_slice(
179 array_keys($RecVals), $StartingResult, $NumberOfResults);
180 $RecValSegment = array();
181 foreach ($RecValKeys as $Key)
183 $RecValSegment[$Key] = $RecVals[$Key];
186 # return recommendation list to caller 187 return $RecValSegment;
197 # save filter function name 198 $this->FilterFuncs[] = $FunctionName;
207 return $this->NumberOfResultsAvailable;
216 return $this->LastSearchTime;
230 # pull list of correlations from DB 231 $this->DB->Query(
"SELECT * FROM RecContentCorrelations, ".$this->RatingTableName
232 .
" WHERE (ItemIdA = ${RecommendedItemId}" 233 .
" OR ItemIdB = ${RecommendedItemId})" 234 .
" AND ".$this->UserIdFieldName.
" = ".$UserId
235 .
" AND (RecContentCorrelations.ItemIdA = " 236 .$this->RatingTableName.
".".$this->ItemIdFieldName
237 .
" OR RecContentCorrelations.ItemIdB = " 238 .$this->RatingTableName.
".".$this->ItemIdFieldName.
")" 239 .
" AND Rating >= 50 " 240 .
" ORDER BY Correlation DESC");
242 # for each correlation 243 $SourceList = array();
244 while ($Row = $this->DB->FetchRow())
246 # pick out appropriate item ID 247 if ($Row[
"ItemIdA"] == $RecommendedItemId)
249 $ItemId = $Row[
"ItemIdB"];
253 $ItemId = $Row[
"ItemIdA"];
256 # add item to recommendation source list 257 $SourceList[$ItemId] = $Row[
"Correlation"];
260 # return recommendation source list to caller 275 print
"REC: searching for items similar to item \"" 279 # make sure we have item IDs available 282 # start with empty array 283 $SimilarItems = array();
286 foreach ($this->ItemIds as $Id)
288 # if item is not specified item 291 # calculate correlation of item to specified item 293 $ItemId, $Id, $FieldList);
295 # if correlation is above threshold 296 if ($Correlation > $this->ContentCorrelationThreshold)
298 # add item to list of similar items 299 $SimilarItems[$Id] = $Correlation;
305 print
"REC: ".count($SimilarItems).
" similar items to item \"" 306 .$ItemId.
"\" found<br>\n";
309 # filter list of similar items (if any) 310 if (count($SimilarItems) > 0)
315 print
"REC: ".count($SimilarItems).
" similar items to item \"" 316 .$ItemId.
"\" left after filtering<br>\n";
320 # if any similar items left 321 if (count($SimilarItems) > 0)
323 # sort list of similar items in order of most to least similar 324 arsort($SimilarItems, SORT_NUMERIC);
327 # return list of similar items to caller 328 return $SimilarItems;
343 print
"REC: generating field value recommendations for item \"" 347 # start with empty array of values 350 # generate list of similar items 353 # if similar items found 354 if (count($SimilarItems) > 0)
356 # prune list of similar items to only top third of better-than-average 357 $AverageCorr = intval(array_sum($SimilarItems) / count($SimilarItems));
358 reset($SimilarItems);
359 $HighestCorr = current($SimilarItems);
360 $CorrThreshold = intval($HighestCorr - (($HighestCorr - $AverageCorr) / 3));
363 print
"REC: <i>Average Correlation: $AverageCorr" 364 .
" Highest Correlation:" 365 .
" $HighestCorr Correlation" 366 .
" Threshold: $CorrThreshold </i><br>\n";
368 foreach ($SimilarItems as $ItemId => $ItemCorr)
370 if ($ItemCorr < $CorrThreshold)
372 unset($SimilarItems[$ItemId]);
377 print
"REC: ".count($SimilarItems)
378 .
" similar items left after threshold pruning<br>\n";
382 foreach ($SimilarItems as $SimItemId => $SimItemCorr)
385 foreach ($this->ContentFields as $FieldName => $FieldAttributes)
387 # load field data for this item 388 $FieldData = $this->GetFieldValue($SimItemId, $FieldName);
390 # if field data is array 391 if (is_array($FieldData))
393 # for each field data value 394 foreach ($FieldData as $FieldDataVal)
396 # if data value is not empty 397 $FieldDataVal = trim($FieldDataVal);
398 if (strlen($FieldDataVal) > 0)
400 # increment count for data value 401 $RecVals[$FieldName][$FieldDataVal]++;
407 # if data value is not empty 408 $FieldData = trim($FieldData);
409 if (strlen($FieldData) > 0)
411 # increment count for data value 412 $RecVals[$FieldName][$FieldData]++;
419 $MatchingCountThreshold = 3;
420 foreach ($RecVals as $FieldName => $FieldVals)
422 # determine cutoff threshold 423 arsort($FieldVals, SORT_NUMERIC);
425 $HighestCount = current($FieldVals);
426 $AverageCount = intval(array_sum($FieldVals) / count($FieldVals));
427 $CountThreshold = intval($AverageCount
428 + (($HighestCount - $AverageCount) / 2));
429 if ($CountThreshold < $MatchingCountThreshold)
431 $CountThreshold = $MatchingCountThreshold;
435 print
"REC: <i>Field: $FieldName " 436 .
" Average Count: $AverageCount " 437 .
" Highest Count: $HighestCount " 438 .
" Count Threshold: $CountThreshold </i><br>\n";
441 # for each field data value 442 foreach ($FieldVals as $FieldVal => $FieldValCount)
444 # if value count is below threshold 445 if ($FieldValCount < $CountThreshold)
448 unset($RecVals[$FieldName][$FieldVal]);
454 print
"REC: found ".count($RecVals[$FieldName])
455 .
" recommended values for field \"" 456 .$FieldName.
"\" after threshold pruning<br>\n";
461 # return recommended values to caller 466 # ---- database update methods 478 print
"REC: UpdateForItems(${StartingItemId}," 479 .
" ${NumberOfItems})<br>\n";
481 # make sure we have item IDs available 487 foreach ($this->ItemIds as $ItemId)
489 # if item ID is within requested range 490 if ($ItemId >= $StartingItemId)
492 # update recommender info for item 494 { print(
"REC: doing item ${ItemId}<br>\n"); }
498 # if we have done requested number of items 499 if ($ItemsUpdated >= $NumberOfItems)
504 print
"REC: bailing out with item ${ItemId}<br>\n";
511 # return ID of last item updated to caller 525 print
"REC: updating for item \"".$ItemId.
"\"<br>\n";
528 # make sure we have item IDs available 531 # clear existing correlations for this item 532 $this->DB->Query(
"DELETE FROM RecContentCorrelations " 533 .
"WHERE ItemIdA = ${ItemId}");
536 foreach ($this->ItemIds as $Id)
538 # if full pass and item is later in list than current item 539 if (($FullPass == FALSE) || ($Id > $ItemId))
541 # update correlation value for item and target item 553 # drop all correlation entries referring to item 554 $this->DB->Query(
"DELETE FROM RecContentCorrelations " 555 .
"WHERE ItemIdA = ".$ItemId.
" " 556 .
"OR ItemIdB = ".$ItemId);
564 # get average correlation 565 $AverageCorrelation = $this->DB->Query(
"SELECT AVG(Correlation) " 566 .
"AS Average FROM RecContentCorrelations",
"Average");
568 # dump all below-average correlations 569 if ($AverageCorrelation > 0)
571 $this->DB->Query(
"DELETE FROM RecContentCorrelations " 572 .
"WHERE Correlation <= ${AverageCorrelation}");
582 if (self::$ItemIdCache === NULL)
584 $this->DB->Query(
"SELECT ".$this->ItemIdFieldName.
" AS Id FROM " 585 .$this->ItemTableName.
" ORDER BY ".$this->ItemIdFieldName);
586 self::$ItemIdCache = $this->DB->FetchColumn(
"Id");
588 return self::$ItemIdCache;
597 self::$CorrelationCache = NULL;
598 self::$ItemIdCache = NULL;
599 self::$ItemDataCache = NULL;
603 # ---- PRIVATE INTERFACE ------------------------------------------------- 605 private $ContentCorrelationThreshold;
606 private $ContentFields;
607 private $ItemTableName;
608 private $RatingTableName;
609 private $ItemIdFieldName;
610 private $UserIdFieldName;
611 private $RatingFieldName;
614 private $FilterFuncs;
615 private $LastSearchTime;
616 private $NumberOfResultsAvailable;
619 static private $ItemIdCache = NULL;
620 static private $ItemDataCache = NULL;
621 static private $CorrelationCache = NULL;
628 # if item IDs not already loaded 629 if (!isset($this->ItemIds))
631 # load item IDs from DB 632 $this->DB->Query(
"SELECT ".$this->ItemIdFieldName.
" AS Id FROM " 633 .$this->ItemTableName.
" ORDER BY ".$this->ItemIdFieldName);
634 $this->ItemIds[] = $this->DB->FetchColumn(
"Id");
646 # if data not already loaded 647 if (!isset(self::$ItemDataCache[$ItemId][$FieldName]))
649 # load field value from DB 650 $FieldValue = $this->GetFieldValue($ItemId, $FieldName);
652 # if field value is array 653 if (is_array($FieldValue))
655 # concatenate together text from array elements 656 $FieldValue = implode(
" ", $FieldValue);
659 # normalize text and break into word array 660 self::$ItemDataCache[$ItemId][$FieldName] =
664 # return cached data to caller 665 return self::$ItemDataCache[$ItemId][$FieldName];
678 if ($this->
DebugLevel > 10) { print(
"REC: calculating correlation" 679 .
" between items $ItemIdA and $ItemIdB<br>\n"); }
681 # order item ID numbers 682 if ($ItemIdA > $ItemIdB)
689 # if we already have the correlation 690 if (isset(self::$CorrelationCache[$ItemIdA][$ItemIdB]))
692 # retrieve correlation from cache 693 $TotalCorrelation = self::$CorrelationCache[$ItemIdA][$ItemIdB];
697 # if list of fields to correlate specified 698 if ($FieldList != NULL)
700 # create list with only specified fields 701 foreach ($FieldList as $FieldName)
703 $ContentFields[$FieldName] = $this->ContentFields[$FieldName];
709 $ContentFields = $this->ContentFields;
712 # for each content field 713 $TotalCorrelation = 0;
714 foreach ($ContentFields as $FieldName => $FieldAttributes)
716 # if field is of a type that we use for correlation 717 $FieldType = intval($FieldAttributes[
"FieldType"]);
718 if (($FieldType == self::CONTENTFIELDTYPE_TEXT)
719 || ($FieldType == self::CONTENTFIELDTYPE_CONTROLLEDNAME))
726 print
"REC: loaded ".count($ItemAData)
727 .
" terms for item #".$ItemIdA.
" and " 728 .count($ItemBData).
" terms for item #" 729 .$ItemIdB.
" for field \"".$FieldName.
"\"<br>\n";
732 # call appropriate routine to get correlation 735 case self::CONTENTFIELDTYPE_TEXT:
736 case self::CONTENTFIELDTYPE_CONTROLLEDNAME:
738 $ItemAData, $ItemBData);
742 # add correlation multiplied by weight to total 743 $TotalCorrelation += $Correlation * $FieldAttributes[
"Weight"];
747 # store correlation to cache 748 self::$CorrelationCache[$ItemIdA][$ItemIdB] = $TotalCorrelation;
751 # return correlation value to caller 754 print(
"REC: correlation between items $ItemIdA and $ItemIdB" 755 .
" found to be $TotalCorrelation<br>\n");
757 return $TotalCorrelation;
767 if ($this->
DebugLevel > 6) { print(
"REC: updating correlation between" 768 .
" items $ItemIdA and $ItemIdB<br>\n"); }
770 # bail out if two items are the same 771 if ($ItemIdA == $ItemIdB) {
return; }
773 # calculate correlation 776 # save new correlation 855 # strip any HTML tags 856 $Text = strip_tags($Text);
858 # strip any punctuation 859 $Text = preg_replace(
"/,\\.\\?-\\(\\)\\[\\]\"/",
" ", $Text); #
" 861 # normalize whitespace 862 $Text = trim(preg_replace("/[\\s]+/
", " ", $Text)); 864 # convert to all lower case 865 $Text = strtolower($Text); 867 # split text into arrays of words 868 $Words = explode(" ", $Text); 870 # filter out all stop words 871 $Words = array_diff($Words, $StopWords); 873 # return word array to caller 883 protected function CalcTextCorrelation($WordsA, $WordsB) 885 # get array containing intersection of two word arrays 886 $IntersectWords = array_intersect($WordsA, $WordsB); 888 # return number of words remaining as score 889 return count($IntersectWords); 899 protected function ContentCorrelation($ItemIdA, $ItemIdB, $NewCorrelation = -1) 901 # if item ID A is greater than item ID B 902 if ($ItemIdA > $ItemIdB) 910 # if new correlation value provided 911 if ($NewCorrelation != -1) 913 # if new value is above threshold 914 if ($NewCorrelation >= $this->ContentCorrelationThreshold) 916 # insert new correlation value in DB 917 $this->DB->Query("INSERT INTO RecContentCorrelations
" 918 ."(ItemIdA, ItemIdB, Correlation)
" 919 ."VALUES (${ItemIdA}, ${ItemIdB}, ${NewCorrelation})
"); 921 # return correlation value is new value 922 $Correlation = $NewCorrelation; 927 # return value is zero 933 # retrieve correlation value from DB 934 $Correlation = $this->DB->Query( 935 "SELECT Correlation FROM RecContentCorrelations
" 936 ."WHERE ItemIdA = ${ItemIdA} AND ItemIdB = ${ItemIdB}
", 939 # if no value found in DB 940 if ($Correlation == FALSE) 942 # return value is zero 947 # return correlation value to caller 956 protected function FilterOnSuppliedFunctions($Results) 958 # if filter functions have been set 959 if (count($this->FilterFuncs) > 0) 962 foreach ($Results as $ResourceId => $Result) 964 # for each filter function 965 foreach ($this->FilterFuncs as $FuncName) 967 # if filter protected function return TRUE for result resource 968 if ($FuncName($ResourceId)) 971 if ($this->DebugLevel > 2) 973 print("REC: filter callback rejected resource
" 974 ." ${ResourceId}<br>\n
"); 976 unset($Results[$ResourceId]); 978 # bail out of filter func loop 985 # return filtered list to caller DebugLevel($Setting)
Set level for debugging output.
RecommendFieldValues($ItemId, $FieldList=NULL)
Dynamically generate and return list of recommended field values for item.
UpdateForItems($StartingItemId, $NumberOfItems)
Update recommender data for range of items.
GetSourceList($UserId, $RecommendedItemId)
Return list of items used to generate recommendation of specified item.
ContentCorrelation($ItemIdA, $ItemIdB, $NewCorrelation=-1)
Get/set stored value for correlation between two items.
const CONTENTFIELDTYPE_CONTROLLEDNAME
AddResultFilterFunction($FunctionName)
Add function to be called to filter returned recommendation list.
const CONTENTFIELDTYPE_DATE
__construct(&$DB, $ItemTableName, $RatingTableName, $ItemIdFieldName, $UserIdFieldName, $RatingFieldName, $ContentFields)
Object constructor.
FilterOnSuppliedFunctions($Results)
Run results through supplied filter functions.
SearchTime()
Get time it took to generate the most recent recommendation.
PruneCorrelations()
Prune any stored correlation values that are below-average.
GetItemIds()
Retrieve all item IDs.
UpdateForItem($ItemId, $FullPass=FALSE)
Update recommender data for specified item.
NumberOfResults()
Get number of recommendations generated.
const CONTENTFIELDTYPE_NUMERIC
GetFieldData($ItemId, $FieldName)
Get data for field.
UpdateContentCorrelation($ItemIdA, $ItemIdB)
Calculate content correlation between two items and update in DB.
Recommend($UserId, $StartingResult=0, $NumberOfResults=10)
Recommend items for specified user.
CalcTextCorrelation($WordsA, $WordsB)
Get value for correlation between two sets of words.
LoadItemIds()
Load internal item ID cache (if not already loaded).
const CONTENTFIELDTYPE_TEXT
FindSimilarItems($ItemId, $FieldList=NULL)
Dynamically generate and return list of items similar to specified item.
const CONTENTFIELDTYPE_DATERAMGE
NormalizeAndParseText($Text)
Normalize text string and parse into words.
CalculateContentCorrelation($ItemIdA, $ItemIdB, $FieldList=NULL)
Calculate content correlation between two items and return value to caller.
DropItem($ItemId)
Drop item from stored recommender data.
static ClearCaches()
Clear internal caches of item and correlation data.