4 # Provides a client for pulling data from OAI-PMH providers 5 # For protocol documentation, see: 6 # http://www.openarchives.org/OAI/openarchivesprotocol.html 9 # OAIClient(ServerUrl, Cache) 12 # - Change the base url of the remote repository 13 # MetadataPrefix($pfx) 14 # - Set the schema we will request from remote 16 # - Restrict queries to a single set 18 # http://www.openarchives.org/OAI/openarchivesprotocol.html#Set 20 # - Fetch identifying information about the remote repository 22 # - Fetch information about what schemas remote can serve 23 # GetRecords($start,$end) 24 # - Pull records in batches, optionally with date restrictions 26 # - Pull a single record using a unique identifier 27 # MoreRecordsAvailable() 28 # - Determine if a batch pull is complete or not 29 # ResetRecordPointer() 30 # - Restart a batch pull from the beginning 32 # - Determine verbosity 34 # Copyright 2014 Edward Almasy and Internet Scout 35 # http://scout.wisc.edu 41 # ---- PUBLIC INTERFACE -------------------------------------------------- 51 # set default debug level 52 $this->DebugLevel = 0;
57 # set default metadata prefix 60 # set default set specification for queries 63 $this->CacheSequenceNumber = 0;
66 $this->Cache = $Cache;
81 if ($NewValue != NULL)
85 return $this->ServerUrl;
95 if ($NewValue != NULL)
99 return $this->MetadataPrefix;
107 public function SetSpec($NewValue =
"X-NOSETSPECVALUE-X")
109 if ($NewValue !=
"X-NOSETSPECVALUE-X")
113 return $this->SetSpec;
125 # query server for XML text 126 $XmlText = $this->PerformQuery(
"Identify");
127 $this->DebugOutVar(8, __METHOD__,
"XmlText", htmlspecialchars($XmlText));
129 # convert XML text into object 130 $Xml = simplexml_load_string($XmlText);
131 $this->DebugOutVar(9, __METHOD__,
"Xml", $Xml);
133 # if identification info was found 135 if (isset($Xml->Identify))
138 $Ident = $Xml->Identify;
139 $this->GetValFromXml($Ident,
"repositoryName",
"Name", $Info);
140 $this->GetValFromXml($Ident,
"adminEmail",
"Email", $Info);
141 $this->GetValFromXml($Ident,
"baseURL",
"URL", $Info);
144 # return info to caller 155 # query server for XML text 156 $XmlText = $this->PerformQuery(
"ListMetadataFormats");
157 $this->DebugOutVar(8, __METHOD__,
"XmlText", htmlspecialchars($XmlText));
159 # convert XML text into object 160 $Xml = simplexml_load_string($XmlText);
161 $this->DebugOutVar(9, __METHOD__,
"Xml", $Xml);
163 # if format info was found 165 if (isset($Xml->ListMetadataFormats->metadataFormat))
169 foreach ($Xml->ListMetadataFormats->metadataFormat as $Format)
171 $this->GetValFromXml(
172 $Format,
"metadataPrefix",
"Name", $Formats[$Index]);
173 $this->GetValFromXml(
174 $Format,
"schema",
"Schema", $Formats[$Index]);
175 $this->GetValFromXml(
176 $Format,
"metadataNamespace",
"Namespace",
182 # return info to caller 193 public function GetRecords($StartDate = NULL, $EndDate = NULL)
195 # if we're using a cache directory, figure out which file 196 # should contain this set of records 197 if ($this->Cache !== NULL)
199 $cache_fname = sprintf(
"%s/%010x",
201 $this->CacheSequenceNumber);
202 $this->CacheSequenceNumber++;
205 # when we're not using a cache or don't have a cached copy of 206 # this set of records, query the OAI provider to get it 207 if ($this->Cache === NULL || !file_exists($cache_fname) )
209 # if we have resumption token from prior query 210 if (isset($this->ResumptionToken))
212 # use resumption token as sole argument 213 $Args[
"resumptionToken"] = $this->ResumptionToken;
217 # set up arguments for query 218 $Args[
"metadataPrefix"] = $this->MetadataPrefix;
219 if ($StartDate) { $Args[
"from"] = $StartDate; }
220 if ($EndDate) { $Args[
"until"] = $EndDate; }
221 if ($this->
SetSpec) { $Args[
"set"] = $this->SetSpec; }
224 # query server for XML text 225 $XmlText = $this->PerformQuery(
"ListRecords", $Args);
227 # if a cache is in use, save this chunk of XML into it 228 if ($this->Cache !== NULL)
230 file_put_contents($cache_fname, $XmlText);
235 # get XML text from the cache 236 $XmlText = file_get_contents($cache_fname);
239 $this->DebugOutVar(8, __METHOD__,
"XmlText", htmlspecialchars($XmlText));
241 return $this->GetRecordsFromXML($XmlText,
"ListRecords");
260 $Args[
"metadataPrefix"] = $this->MetadataPrefix;
261 $Args[
"identifier"] = $Id;
263 # query server for XML text 264 $XmlText = $this->PerformQuery(
"GetRecord", $Args);
265 $this->DebugOutVar(8, __METHOD__,
"XmlText", htmlspecialchars($XmlText));
267 return $this->GetRecordsFromXML($XmlText,
"GetRecord");
277 return isset($this->ResumptionToken) ? TRUE : FALSE;
285 unset($this->ResumptionToken);
286 $this->CacheSequenceNumber = 0;
296 $this->DebugLevel = $NewLevel;
300 # ---- PRIVATE INTERFACE ------------------------------------------------- 303 private $MetadataPrefix;
306 private $ResumptionToken;
308 private $CacheSequenceNumber;
316 private function PerformQuery($QueryVerb, $Args = NULL)
318 # open stream to OAI server 320 if (strpos($this->
ServerUrl,
"?") === FALSE)
322 $QueryUrl = $this->
ServerUrl.
"?verb=".$QueryVerb;
326 $QueryUrl = $this->
ServerUrl.
"&verb=".$QueryVerb;
331 foreach ($Args as $ArgName => $ArgValue)
333 $QueryUrl .=
"&".urlencode($ArgName).
"=".urlencode($ArgValue);
336 $FHndl = fopen($QueryUrl,
"r");
338 # if stream was successfully opened 340 if ($FHndl !== FALSE)
342 # while lines left in response 343 while (!feof($FHndl))
345 # read line from server and add it to text to be parsed 346 $Text .= fread($FHndl, 10000000);
350 # close OAI server stream 353 # return query result data to caller 364 private function GetValFromXml($Xml, $SrcName, $DstName, &$Results)
366 if (isset($Xml->$SrcName))
368 $Results[$DstName] = trim($Xml->$SrcName);
379 private function DebugOutVar($Level, $MethodName, $VarName, $VarValue)
381 if ($this->DebugLevel >= $Level)
383 print(
"\n<pre>".$MethodName.
"() ".$VarName.
" = \n");
452 private function ExtractDataFromXml(&$Records, $Index, DOMNode $dom,
453 $Section, $ParentTagName=NULL)
455 foreach ($dom->childNodes as $node)
457 # for DOM children that are elements (rather than comments, text, 459 if ($node->nodeType == XML_ELEMENT_NODE)
461 # compute a tag name to use 463 (($ParentTagName!==NULL) ? $ParentTagName.
"/" :
"")
466 # Glue together the contents of the 'text' children of this node 468 foreach ($node->childNodes as $child)
470 if ($child->nodeType == XML_TEXT_NODE)
472 $Value .= $child->nodeValue;
476 # if we had a non-empty value, add it to the results 477 if (strlen(trim($Value))>0)
479 $Records[$Index][$Section][$StorageTagName][]= $Value;
482 # and process our children 483 $this->ExtractDataFromXml($Records, $Index,
484 $node, $Section, $StorageTagName);
496 private function GetFirstElement(DOMNode $dom)
498 foreach ($dom->childNodes as $child)
500 if ($child->nodeType == XML_ELEMENT_NODE)
525 private function GetRecordsFromXML($XmlText, $ParseTo)
527 # create XML parser and pass it text 528 $Xml = simplexml_load_string($XmlText);
530 # if text could not be parsed, return NULL 531 if (! $Xml instanceof SimpleXmlElement )
536 # set up vars to hold our results 540 # we'll want to find our records with XPath, so we need to 541 # register a prefix for the oai elements 542 $Xml->registerXPathNamespace(
'oai',
"http://www.openarchives.org/OAI/2.0/");
544 # extract records, iterate over them 545 $RecordXML = $Xml->xpath(
"oai:".$ParseTo.
"//oai:record");
546 foreach ($RecordXML as $Record)
548 # pull relevant information out of the header 550 # Note that SimpleXMLElement objects map elements onto PHP 551 # object properties, and will return a SimpleXMLElement w/o 552 # any associated XML for non-existent elements. So, 553 # nothing explodes when we ask the Record for an element it 556 # However, SimpleXMLElements w/o associated XML return 557 # 'NULL' for all properties. Therefore, if we tried to 558 # look at the grandchild of a non-existent element it would 559 # be problematic. In the cases below, we get empty 560 # strings when the children of 'header' &c are empty, which 561 # is what we want anyway. 563 $Records[$Index][
"identifier"] = (string)$Record->header->identifier;
564 $Records[$Index][
"datestamp"] = (
string)$Record->header->datestamp;
566 # grab associated metadata (if there is any) 567 if ($Record->metadata->count() > 0)
569 # to avoid frustrations with namespaces and SimpleXML, use 570 # DOMDocument to parse the record data 571 $doc = dom_import_simplexml( $Record->metadata );
573 # get the 'record' element 574 $doc = $this->GetFirstElement( $doc );
576 # record the format used for this record 577 $Records[$Index][
"format"] = $doc->nodeName;
579 # extract data for this record 580 $this->ExtractDataFromXml( $Records, $Index, $doc,
"metadata" );
583 # if there is additional information available, snag that too 584 if ($Record->about->count() > 0)
586 $doc = dom_import_simplexml( $Record->about );
587 $this->ExtractDataFromXml($Records, $Index, $doc,
"about");
590 # move along to the next record 594 # look for resumption token and save if found (as above, we'll 595 # get an empty string if either ListRecords or resumptionToken 597 $Token = (string)$Xml->ListRecords->resumptionToken;
599 if (strlen($Token)>0)
601 $this->ResumptionToken = $Token;
605 unset($this->ResumptionToken);
608 # return records to caller
ResetRecordPointer()
Clear any additional records available after last GetRecords().
ServerUrl($NewValue=NULL)
Get or set URL of target OAI repository server.
GetRecord($Id)
Get a single record from a repository server.
__construct($ServerUrl, $Cache=NULL)
Class constructor.
MoreRecordsAvailable()
Check whether more records are available after last GetRecords().
GetRecords($StartDate=NULL, $EndDate=NULL)
Retrieve records from repository server.
MetadataPrefix($NewValue=NULL)
Get or set metadata schema for records being retrieved.
SetSpec($NewValue="X-NOSETSPECVALUE-X")
Get or set specification of subset of records to be retrieved.
GetIdentification()
Retrieve identification information from repository server.
SetDebugLevel($NewLevel)
Set current debug output level.
GetFormats()
Retrieve list of available metadata formats from repository server.