00001 <?PHP
00002
00003 #
00004 # FILE: Scout--OAIClient.php
00005 # Provides a client for pulling data from OAI-PMH providers
00006 # For protocol documentation, see:
00007 # http://www.openarchives.org/OAI/openarchivesprotocol.html
00008 #
00009 # METHODS PROVIDED:
00010 # OAIClient(ServerUrl, Cache)
00011 # - constructor
00012 # ServerUrl(NewValue)
00013 # - Change the base url of the remote repository
00014 # MetadataPrefix($pfx)
00015 # - Set the schema we will request from remote
00016 # SetSpec($set)
00017 # - Restrict queries to a single set
00018 # for details, see
00019 # http://www.openarchives.org/OAI/openarchivesprotocol.html#Set
00020 # GetIdentification()
00021 # - Fetch identifying information about the remote repository
00022 # GetFormats()
00023 # - Fetch information about what schemas remote can serve
00024 # GetRecords($start,$end)
00025 # - Pull records in batches, optionally with date restrictions
00026 # GetRecord($id)
00027 # - Pull a single record using a unique identifier
00028 # MoreRecordsAvailable()
00029 # - Determine if a batch pull is complete or not
00030 # ResetRecordPointer()
00031 # - Restart a batch pull from the beginning
00032 # SetDebugLevel($NewLevel)
00033 # - Set the verbosity of debugging output (0, the default, prints nothing)
00034 #
00035 # Copyright 2008 Edward Almasy and Internet Scout
00036 # http://scout.wisc.edu
00037 #
00038
00039 require_once("XMLParser.php");
00040
00041
/**
* Client for pulling data from OAI-PMH providers.
*
* Typical batch use: construct with the repository base URL, optionally set
* a metadata prefix and set specification, then call GetRecords() repeatedly
* while MoreRecordsAvailable() returns TRUE.
*/
class OAIClient {

    # ---- PUBLIC INTERFACE --------------------------------------------------

    /**
    * Class constructor.
    * @param string $ServerUrl Base URL of the OAI-PMH repository to query.
    * @param string $Cache Directory used to cache ListRecords responses
    *       (OPTIONAL).  If the directory already exists, GetRecords() reads
    *       previously saved responses from it instead of querying the
    *       server; if it does not exist, it is created and each response
    *       fetched by GetRecords() is saved into it.
    */
    public function __construct($ServerUrl, $Cache = NULL)
    {
        # set default debug level
        $this->DebugLevel = 0;

        # save OAI server URL
        $this->ServerUrl = $ServerUrl;

        # set default metadata prefix
        $this->MetadataPrefix = "oai_dc";

        # set default set specification for queries
        $this->SetSpec = NULL;

        # start cache file numbering from the beginning
        $this->CacheSequenceNumber = 0;
        if ($Cache !== NULL)
        {
            $this->Cache = $Cache;

            # an existing directory means we replay from it, otherwise we
            #       create it and fill it as responses are fetched
            $this->UsingCache = is_dir($Cache);
            if ($this->UsingCache == FALSE)
            {
                mkdir($Cache);
            }
        }
    }

    /**
    * Legacy (PHP 4 style) constructor name.  Same-name methods are no
    * longer treated as constructors as of PHP 8, so the real constructor is
    * __construct(); this method remains for code that calls it explicitly.
    * @param string $ServerUrl Base URL of the OAI-PMH repository to query.
    * @param string $Cache Cache directory (OPTIONAL, see __construct()).
    */
    public function OAIClient($ServerUrl, $Cache = NULL)
    {
        $this->__construct($ServerUrl, $Cache);
    }

    /**
    * Get or set the base URL of the remote repository.
    * @param string $NewValue New base URL (OPTIONAL).  Values that compare
    *       loosely equal to NULL (such as an empty string) are ignored.
    * @return string Current base URL.
    */
    public function ServerUrl($NewValue = NULL)
    {
        if ($NewValue != NULL)
        {
            $this->ServerUrl = $NewValue;
        }
        return $this->ServerUrl;
    }

    /**
    * Get or set the metadata prefix (schema) requested from the repository.
    * @param string $NewValue New metadata prefix (OPTIONAL).  Values that
    *       compare loosely equal to NULL are ignored.
    * @return string Current metadata prefix ("oai_dc" by default).
    */
    public function MetadataPrefix($NewValue = NULL)
    {
        if ($NewValue != NULL)
        {
            $this->MetadataPrefix = $NewValue;
        }
        return $this->MetadataPrefix;
    }

    /**
    * Get or set the set specification used to restrict ListRecords queries.
    * @param string $NewValue New set specification, or NULL to remove the
    *       restriction (OPTIONAL).  A sentinel default is used so that an
    *       explicit NULL can be told apart from "no argument supplied".
    * @return string Current set specification, or NULL if none is set.
    */
    public function SetSpec($NewValue = "X-NOSETSPECVALUE-X")
    {
        # strict comparison so that values like 0 or TRUE are not mistaken
        #       for the sentinel
        if ($NewValue !== "X-NOSETSPECVALUE-X")
        {
            $this->SetSpec = $NewValue;
        }
        return $this->SetSpec;
    }

    /**
    * Fetch identifying information about the remote repository.
    * @return array Associative array with any of the keys "Name", "Email",
    *       and "URL" that were present in the Identify response.  Empty if
    *       the query failed or no identification info was returned.
    */
    public function GetIdentification()
    {
        # query server for XML text
        $XmlText = $this->PerformQuery("Identify");
        $this->DebugOutVar(8,__METHOD__,"XmlText",htmlspecialchars($XmlText));

        # convert XML text into object
        $Xml = $this->LoadXml($XmlText);
        $this->DebugOutVar(9, __METHOD__, "Xml", $Xml);

        # if identification info was found
        $Info = array();
        if (($Xml !== FALSE) && isset($Xml->Identify))
        {
            # extract info
            $Ident = $Xml->Identify;
            $this->GetValFromXml($Ident, "repositoryName", "Name", $Info);
            $this->GetValFromXml($Ident, "adminEmail", "Email", $Info);
            $this->GetValFromXml($Ident, "baseURL", "URL", $Info);
        }

        # return info to caller
        return $Info;
    }

    /**
    * Fetch information about the metadata formats the repository can serve.
    * @return array List of associative arrays, one per format, each with
    *       any of the keys "Name", "Schema", and "Namespace" that were
    *       present in the response.  Empty if the query failed or no format
    *       info was returned.
    */
    public function GetFormats()
    {
        # query server for XML text
        $XmlText = $this->PerformQuery("ListMetadataFormats");
        $this->DebugOutVar(8,__METHOD__,"XmlText",htmlspecialchars($XmlText));

        # convert XML text into object
        $Xml = $this->LoadXml($XmlText);
        $this->DebugOutVar(9, __METHOD__, "Xml", $Xml);

        # if format info was found
        $Formats = array();
        if (($Xml !== FALSE)
                && isset($Xml->ListMetadataFormats->metadataFormat))
        {
            # extract info for each format
            foreach ($Xml->ListMetadataFormats->metadataFormat as $Format)
            {
                # build each entry locally so that a format with none of
                #       the expected children yields an empty array
                $FormatInfo = array();
                $this->GetValFromXml(
                        $Format, "metadataPrefix", "Name", $FormatInfo);
                $this->GetValFromXml(
                        $Format, "schema", "Schema", $FormatInfo);
                $this->GetValFromXml(
                        $Format, "metadataNamespace", "Namespace",
                        $FormatInfo);
                $Formats[] = $FormatInfo;
            }
        }

        # return info to caller
        return $Formats;
    }

    /**
    * Retrieve the next batch of records.  The first call issues a
    * ListRecords query with the current metadata prefix, set specification,
    * and supplied date limits; while a resumption token from a prior call
    * is held, subsequent calls send only that token.  When a cache
    * directory that already existed was supplied to the constructor, the
    * response text is read from the cache instead of from the server.
    * @param string $StartDate Value for the "from" argument (OPTIONAL).
    * @param string $EndDate Value for the "until" argument (OPTIONAL).
    * @return array List of records (see GetRecordsFromXML() for layout).
    */
    public function GetRecords($StartDate = NULL, $EndDate = NULL)
    {
        # work out which cache file corresponds to this batch
        $CacheFileName = NULL;
        if ($this->Cache != NULL)
        {
            $CacheFileName = sprintf("%s/%010x",
                    $this->Cache,
                    $this->CacheSequenceNumber);
            $this->CacheSequenceNumber++;
        }

        if ($this->Cache == NULL or $this->UsingCache == FALSE)
        {
            $Args = array();

            # if we have resumption token from prior query
            if (isset($this->ResumptionToken))
            {
                # use resumption token as sole argument
                $Args["resumptionToken"] = $this->ResumptionToken;
            }
            else
            {
                # set up arguments for query
                $Args["metadataPrefix"] = $this->MetadataPrefix;
                if ($StartDate) {  $Args["from"] = $StartDate;  }
                if ($EndDate) {  $Args["until"] = $EndDate;  }
                if ($this->SetSpec) {  $Args["set"] = $this->SetSpec;  }
            }

            # query server for XML text
            $XmlText = $this->PerformQuery("ListRecords", $Args);

            # save response if we are populating a cache
            if ($CacheFileName !== NULL)
            {
                file_put_contents($CacheFileName, $XmlText);
            }
        }
        else
        {
            # get XML text from the cache
            $XmlText = file_get_contents($CacheFileName);

            # treat a missing or unreadable cache file as an empty response
            if ($XmlText === FALSE)
            {
                $XmlText = "";
            }
        }

        $this->DebugOutVar(8, __METHOD__,"XmlText",htmlspecialchars($XmlText));

        return $this->GetRecordsFromXML($XmlText, "listrecords");
    }

    /**
    * Retrieve a single record using its unique identifier.
    * NOTE: the response is parsed by the same routine used for batch
    * pulls, which also refreshes the saved resumption token, so calling
    * this in the middle of a batch pull discards the batch position.
    * @param string $Id OAI identifier of the record to fetch.
    * @return array List of records (see GetRecordsFromXML() for layout).
    */
    public function GetRecord($Id)
    {
        $Args = array(
                "metadataPrefix" => $this->MetadataPrefix,
                "identifier" => $Id,
                );

        # query server for XML text
        $XmlText = $this->PerformQuery("GetRecord", $Args);
        $this->DebugOutVar(8, __METHOD__,"XmlText",htmlspecialchars($XmlText));

        return $this->GetRecordsFromXML($XmlText, "getrecord");
    }

    /**
    * Determine whether a batch pull is complete.
    * @return bool TRUE if a resumption token is held from the most recent
    *       response (i.e. more records can be fetched), otherwise FALSE.
    */
    public function MoreRecordsAvailable()
    {
        return isset($this->ResumptionToken);
    }

    /**
    * Restart a batch pull from the beginning, by discarding any saved
    * resumption token and rewinding the cache file sequence.
    */
    public function ResetRecordPointer()
    {
        $this->ResumptionToken = NULL;
        $this->CacheSequenceNumber = 0;
    }

    /**
    * Set the verbosity of debugging output.
    * @param int $NewLevel New debug level.  Raw response text is printed at
    *       level 8 and above, parsed objects at level 9 and above.
    */
    public function SetDebugLevel($NewLevel)
    {
        $this->DebugLevel = $NewLevel;
    }


    # ---- PRIVATE INTERFACE -------------------------------------------------

    private $ServerUrl;
    private $MetadataPrefix;
    private $SetSpec;
    private $DebugLevel;
    private $ResumptionToken;
    private $Cache = NULL;
    private $UsingCache = FALSE;
    private $CacheSequenceNumber;

    /**
    * Perform OAI query and return resulting data to caller.
    * @param string $QueryVerb OAI-PMH verb to send.
    * @param array $Args Additional query arguments, as name => value
    *       (OPTIONAL).  Names and values are URL-encoded.
    * @return string Response text, or an empty string if the server could
    *       not be reached.
    */
    private function PerformQuery($QueryVerb, $Args = NULL)
    {
        # append to any query string already present in the base URL
        $Separator = (strpos($this->ServerUrl, "?") === FALSE) ? "?" : "&";
        $QueryUrl = $this->ServerUrl.$Separator."verb=".$QueryVerb;

        if ($Args)
        {
            foreach ($Args as $ArgName => $ArgValue)
            {
                $QueryUrl .= "&".urlencode($ArgName)."=".urlencode($ArgValue);
            }
        }

        # open stream to OAI server
        $FHndl = fopen($QueryUrl, "r");

        # if stream was successfully opened
        $Text = "";
        if ($FHndl !== FALSE)
        {
            # while data left in response
            while (!feof($FHndl))
            {
                # read data from server and add it to text to be parsed
                $Chunk = fread($FHndl, 10000000);

                # stop on read failure rather than looping forever
                if ($Chunk === FALSE) {  break;  }
                $Text .= $Chunk;
            }

            # close OAI server stream (only valid if open succeeded)
            fclose($FHndl);
        }

        # return query result data to caller
        return $Text;
    }

    /**
    * Convert response text into a SimpleXML object.
    * @param string $XmlText Response text.
    * @return mixed SimpleXMLElement on success, or FALSE if the text was
    *       empty or could not be parsed.
    */
    private function LoadXml($XmlText)
    {
        # an empty response (e.g. failed query) cannot be parsed
        if (!is_string($XmlText) || (trim($XmlText) === ""))
        {
            return FALSE;
        }
        return simplexml_load_string($XmlText);
    }

    /**
    * Set array value if available in SimpleXML object.
    * @param object $Xml SimpleXML element to read from.
    * @param string $SrcName Name of child element to look for.
    * @param string $DstName Key under which to store the trimmed value.
    * @param array $Results Array to store value in (passed by reference).
    */
    private function GetValFromXml($Xml, $SrcName, $DstName, &$Results)
    {
        if (isset($Xml->$SrcName))
        {
            $Results[$DstName] = trim((string)$Xml->$SrcName);
        }
    }

    /**
    * Print variable contents if debug level is at or above specified level.
    * @param int $Level Minimum debug level at which to print.
    * @param string $MethodName Name of calling method.
    * @param string $VarName Name of variable being printed.
    * @param mixed $VarValue Value to print.
    */
    private function DebugOutVar($Level, $MethodName, $VarName, $VarValue)
    {
        if ($this->DebugLevel >= $Level)
        {
            print("\n<pre>".$MethodName."()  ".$VarName." = \n");
            print_r($VarValue);
            print("</pre>\n");
        }
    }

    /**
    * Recursively dump tags inside a metadata section, flattening them as
    * we go.  Values of tags without children are appended to
    * $Records[$Index]["metadata"] under a key made of the tag names on the
    * path to the tag, joined with "/".
    * @param array $Records Record list to add to (passed by reference).
    * @param int $Index Index of record currently being filled in.
    * @param object $Parser Parser positioned on first tag to dump.
    * @param string $ParentTagName Flattened name of enclosing tag (OPTIONAL).
    */
    private function DumpTagsRecursive(&$Records, $Index, $Parser,
            $ParentTagName = NULL)
    {
        $TagName = $Parser->GetTagName();
        do
        {
            $StorageTagName = ($ParentTagName !== NULL)
                    ? $ParentTagName."/".$TagName : $TagName;

            if ($Parser->SeekToChild())
            {
                # tag has children, so descend and then come back up
                $this->DumpTagsRecursive(
                        $Records, $Index, $Parser, $StorageTagName);
                $Parser->SeekToParent();
            }
            else
            {
                $Records[$Index]["metadata"][$StorageTagName][] =
                        $Parser->GetData();
            }
        } while ($TagName = $Parser->NextTag());
    }

    /**
    * Extract records (and any resumption token) from query response text.
    * @param string $XmlText Response text.
    * @param string $ParseTo Name of response element that contains the
    *       records ("listrecords" or "getrecord").
    * @return array List of records, each an array with "identifier" and
    *       "datestamp" entries, plus "format" and "metadata" entries when a
    *       metadata section was found, and an "about" entry when search
    *       info was found.
    */
    private function GetRecordsFromXML($XmlText, $ParseTo)
    {
        # create XML parser and pass it text
        $Parser = new XMLParser();
        $Parser->ParseText($XmlText);

        $this->DebugOutVar(9, __METHOD__, "Parser", $Parser);

        # if records were found
        $Records = array();
        $ItemCount = $Parser->SeekTo("oai-pmh", $ParseTo, "record");
        if ($ItemCount)
        {
            # for each record
            $Index = 0;
            do
            {
                # grab record identifier and date
                $Records[$Index]["identifier"] =
                        $Parser->GetData("header", "identifier");
                $Records[$Index]["datestamp"] =
                        $Parser->GetData("header", "datestamp");

                # grab metadata
                $SeekResult = $Parser->SeekTo("metadata");
                if ($SeekResult)
                {
                    $SeekResult = $Parser->SeekToChild();
                    if ($SeekResult)
                    {
                        # name of first tag inside metadata is the format
                        $Records[$Index]["format"] = $Parser->GetTagName();
                        $SeekResult = $Parser->SeekToChild();
                        if ($SeekResult)
                        {
                            $this->DumpTagsRecursive(
                                    $Records, $Index, $Parser);
                            $Parser->SeekToParent();
                        }
                        $Parser->SeekToParent();
                    }
                    $Parser->SeekToParent();
                }

                # grab search info (if any)
                $SeekResult = $Parser->SeekTo("about");
                if ($SeekResult)
                {
                    $SeekResult = $Parser->SeekTo("searchInfo");
                    if ($SeekResult)
                    {
                        $SeekResult = $Parser->SeekToChild();
                        if ($SeekResult)
                        {
                            $TagName = $Parser->GetTagName();
                            do
                            {
                                $Records[$Index]["about"]["SEARCHINFO"][$TagName][] =
                                        $Parser->GetData();
                            } while ($TagName = $Parser->NextTag());
                            $Parser->SeekToParent();
                        }
                        $Parser->SeekToParent();
                    }
                    $Parser->SeekToParent();
                }

                $Index++;
            }
            while ($Parser->NextItem());
        }

        # look for resumption token
        $Parser->SeekToRoot();
        $SeekResult = $Parser->SeekTo(
                "oai-pmh", "listrecords", "resumptiontoken");
        $Token = ($SeekResult !== NULL) ? $Parser->GetData() : NULL;

        # save token if found and not empty (OAI-PMH marks the final batch
        #       of a list with an empty resumptionToken element, which must
        #       not be mistaken for more records being available)
        if (is_scalar($Token) && (trim((string)$Token) !== ""))
        {
            $this->ResumptionToken = $Token;
        }
        else
        {
            $this->ResumptionToken = NULL;
        }

        # return records to caller
        return $Records;
    }

}
00484
00485 ?>