rss_fetch.inc

  1: <?php
  2: /*
  3:  * Project:     MagpieRSS: a simple RSS integration tool
  4:  * File:        rss_fetch.inc, a simple functional interface
  5:                 to fetching and parsing RSS files, via the
  6:                 function fetch_rss()
  7:  * Author:      Kellan Elliott-McCrea <kellan@protest.net>
  8:  * License:     GPL
  9:  *
 10:  * The latest version of MagpieRSS can be obtained from:
 11:  * http://magpierss.sourceforge.net
 12:  *
 13:  * For questions, help, comments, discussion, etc., please join the
 14:  * Magpie mailing list:
 15:  * magpierss-general@lists.sourceforge.net
 16:  *
 17:  */
 18: 
 19: // Setup MAGPIE_DIR for use on hosts that don't include
 20: // the current path in include_path.
 21: // with thanks to rajiv and smarty
 22: if (!defined('DIR_SEP')) {
 23:     define('DIR_SEP', DIRECTORY_SEPARATOR);
 24: }
 25: 
 26: if (!defined('MAGPIE_DIR')) {
 27:     define('MAGPIE_DIR', dirname(__FILE__) . DIR_SEP);
 28: }
 29: 
 30: require_once( MAGPIE_DIR . 'rss_parse.inc' );
 31: require_once( MAGPIE_DIR . 'rss_cache.inc' );
 32: 
 33: // for including 3rd party libraries
 34: define('MAGPIE_EXTLIB', MAGPIE_DIR . 'extlib' . DIR_SEP);
 35: require_once( MAGPIE_EXTLIB . 'Snoopy.class.inc');
 36: 
 37: 
 38: /*
 39:  * CONSTANTS - redefine these in your script to change the
 40:  * behaviour of fetch_rss(). Currently, most options affect the cache.
 41:  *
 42:  * MAGPIE_CACHE_ON - Should Magpie cache parsed RSS objects?
 43:  * For me a built in cache was essential to creating a "PHP-like"
 44:  * feel to Magpie, see rss_cache.inc for rationale
 45:  *
 46:  *
 47:  * MAGPIE_CACHE_DIR - Where should Magpie cache parsed RSS objects?
 48:  * This should be a location that the webserver can write to.   If this
 49:  * directory does not already exist Magpie will try to be smart and create
 50:  * it.  This will often fail for permissions reasons.
 51:  *
 52:  *
 53:  * MAGPIE_CACHE_AGE - How long to store cached RSS objects? In seconds.
 54:  *
 55:  *
 56:  * MAGPIE_CACHE_FRESH_ONLY - If remote fetch fails, throw error
 57:  * instead of returning stale object?
 58:  *
 59:  * MAGPIE_DEBUG - Display debugging notices?
 60:  *
 61: */
 62: 
 63: 
 64: /*=======================================================================*\
 65:     Function: fetch_rss:
 66:     Purpose:  return RSS object for the given url
 67:               maintain the cache
 68:     Input:    url of RSS file
 69:     Output:   parsed RSS object (see rss_parse.inc)
 70: 
 71:     NOTES ON CACHEING:
 72:     If caching is on (MAGPIE_CACHE_ON) fetch_rss will first check the cache.
 73: 
 74:     NOTES ON RETRIEVING REMOTE FILES:
 75:     If conditional gets are on (MAGPIE_CONDITIONAL_GET_ON) fetch_rss will
 76:     return a cached object, and touch the cache object upon receiving a
 77:     304.
 78: 
 79:     NOTES ON FAILED REQUESTS:
 80:     If there is an HTTP error while fetching an RSS object, the cached
 81:     version will be returned, if it exists (and if MAGPIE_CACHE_FRESH_ONLY is off)
 82: \*=======================================================================*/
 83: 
// Library version; init() embeds it in the default MAGPIE_USER_AGENT string.
define('MAGPIE_VERSION', '0.72');

// Last error message recorded by error(); read it through magpie_error().
$MAGPIE_ERROR = "";
 87: 
 88: function fetch_rss ($url) {
 89:     // initialize constants
 90:     init();
 91: 
 92:     if ( !isset($url) ) {
 93:         error("fetch_rss called without a url");
 94:         return false;
 95:     }
 96: 
 97:     // if cache is disabled
 98:     if ( !MAGPIE_CACHE_ON ) {
 99:         // fetch file, and parse it
100:         $resp = _fetch_remote_file( $url );
101:         if ( is_success( $resp->status ) ) {
102:             return _response_to_rss( $resp );
103:         }
104:         else {
105:             error("Failed to fetch $url and cache is off");
106:             return false;
107:         }
108:     }
109:     // else cache is ON
110:     else {
111:         // Flow
112:         // 1. check cache
113:         // 2. if there is a hit, make sure its fresh
114:         // 3. if cached obj fails freshness check, fetch remote
115:         // 4. if remote fails, return stale object, or error
116: 
117:         $cache = new RSSCache( MAGPIE_CACHE_DIR, MAGPIE_CACHE_AGE );
118: 
119:         if (MAGPIE_DEBUG and $cache->ERROR) {
120:             debug($cache->ERROR, E_USER_WARNING);
121:         }
122: 
123: 
124:         $cache_status    = 0;       // response of check_cache
125:         $request_headers = array(); // HTTP headers to send with fetch
126:         $rss             = 0;       // parsed RSS object
127:         $errormsg        = 0;       // errors, if any
128: 
129:         // store parsed XML by desired output encoding
130:         // as character munging happens at parse time
131:         $cache_key       = $url . MAGPIE_OUTPUT_ENCODING;
132: 
133:         if (!$cache->ERROR) {
134:             // return cache HIT, MISS, or STALE
135:             $cache_status = $cache->check_cache( $cache_key);
136:         }
137: 
138:         // if object cached, and cache is fresh, return cached obj
139:         if ( $cache_status == 'HIT' ) {
140:             $rss = $cache->get( $cache_key );
141:             if ( isset($rss) and $rss ) {
142:                 // should be cache age
143:                 $rss->from_cache = 1;
144:                 if ( MAGPIE_DEBUG > 1) {
145:                     debug("MagpieRSS: Cache HIT", E_USER_NOTICE);
146:                 }
147:                 return $rss;
148:             }
149:         }
150: 
151:         // else attempt a conditional get
152: 
153:         // setup headers
154:         if ( $cache_status == 'STALE' ) {
155:             $rss = $cache->get( $cache_key );
156:             if ( $rss and $rss->etag and $rss->last_modified ) {
157:                 $request_headers['If-None-Match'] = $rss->etag;
158:                 $request_headers['If-Last-Modified'] = $rss->last_modified;
159:             }
160:         }
161: 
162:         $resp = _fetch_remote_file( $url, $request_headers );
163: 
164:         if (isset($resp) and $resp) {
165:           if ($resp->status == '304' ) {
166:                 // we have the most current copy
167:                 if ( MAGPIE_DEBUG > 1) {
168:                     debug("Got 304 for $url");
169:                 }
170:                 // reset cache on 304 (at minutillo insistent prodding)
171:                 $cache->set($cache_key, $rss);
172:                 return $rss;
173:             }
174:             elseif ( is_success( $resp->status ) ) {
175:                 $rss = _response_to_rss( $resp );
176:                 if ( $rss ) {
177:                     if (MAGPIE_DEBUG > 1) {
178:                         debug("Fetch successful");
179:                     }
180:                     // add object to cache
181:                     $cache->set( $cache_key, $rss );
182:                     return $rss;
183:                 }
184:             }
185:             else {
186:                 $errormsg = "Failed to fetch $url ";
187:                 if ( $resp->status == '-100' ) {
188:                     $errormsg .= "(Request timed out after " . MAGPIE_FETCH_TIME_OUT . " seconds)";
189:                 }
190:                 elseif ( $resp->error ) {
191:                     # compensate for Snoopy's annoying habbit to tacking
192:                     # on '\n'
193:                     $http_error = substr($resp->error, 0, -2);
194:                     $errormsg .= "(HTTP Error: $http_error)";
195:                 }
196:                 else {
197:                     $errormsg .=  "(HTTP Response: " . $resp->response_code .')';
198:                 }
199:             }
200:         }
201:         else {
202:             $errormsg = "Unable to retrieve RSS file for unknown reasons.";
203:         }
204: 
205:         // else fetch failed
206: 
207:         // attempt to return cached object
208:         if ($rss) {
209:             if ( MAGPIE_DEBUG ) {
210:                 debug("Returning STALE object for $url");
211:             }
212:             return $rss;
213:         }
214: 
215:         // else we totally failed
216:         error( $errormsg );
217: 
218:         return false;
219: 
220:     } // end if ( !MAGPIE_CACHE_ON ) {
221: } // end fetch_rss()
222: 
/*=======================================================================*\
    Function:   error
    Purpose:    record a message in MAGPIE_ERROR and raise it as a PHP error
    Input:      the message, and the error level to raise it at
                (defaults to E_USER_WARNING)
\*=======================================================================*/

function error ($errormsg, $lvl=E_USER_WARNING) {
    global $MAGPIE_ERROR;

    // append PHP's own message when one is visible in this scope
    // NOTE(review): $php_errormsg is looked up as a local of this function;
    // confirm it can ever be set here.
    if ( isset($php_errormsg) ) {
        $errormsg .= " ($php_errormsg)";
    }

    // nothing to report
    if ( !$errormsg ) {
        return;
    }

    $errormsg = "MagpieRSS: $errormsg";
    $MAGPIE_ERROR = $errormsg;
    trigger_error( $MAGPIE_ERROR, $lvl );
}
241: 
// Raise a debugging notice, prefixed so it can be told apart from real
// Magpie errors. $lvl is the PHP error level (E_USER_NOTICE by default).
function debug ($debugmsg, $lvl=E_USER_NOTICE) {
    $notice = "MagpieRSS [debug] $debugmsg";

    trigger_error($notice, $lvl);
}
245: 
/*=======================================================================*\
    Function:   magpie_error
    Purpose:    accessor for the magpie error variable
    Input:      optional message; a non-empty value replaces the stored one
    Output:     the current value of MAGPIE_ERROR
\*=======================================================================*/
function magpie_error ($errormsg="") {
    global $MAGPIE_ERROR;

    // only a non-empty argument overwrites the stored message
    if ( !empty($errormsg) ) {
        $MAGPIE_ERROR = $errormsg;
    }

    return $MAGPIE_ERROR;
}
259: 
/*=======================================================================*\
    Function:   _fetch_remote_file
    Purpose:    retrieve an arbitrary remote file
    Input:      url of the remote file
                headers to send along with the request (optional array;
                any non-array value is ignored)
    Output:     the Snoopy client used for the request, which doubles as
                the HTTP response object (see Snoopy.class.inc)
\*=======================================================================*/
function _fetch_remote_file ($url, $headers = "" ) {
    // Snoopy is an HTTP client in PHP
    $snoopy = new Snoopy();

    // apply the Magpie-wide transport settings
    $snoopy->agent        = MAGPIE_USER_AGENT;
    $snoopy->read_timeout = MAGPIE_FETCH_TIME_OUT;
    $snoopy->use_gzip     = MAGPIE_USE_GZIP;

    // extra request headers are forwarded only when given as an array
    if ( is_array($headers) ) {
        $snoopy->rawheaders = $headers;
    }

    // PHP diagnostics raised by fetch() are suppressed; the caller inspects
    // the returned client instead
    @$snoopy->fetch($url);

    return $snoopy;
}
281: 
/*=======================================================================*\
    Function:   _response_to_rss
    Purpose:    parse an HTTP response object into an RSS object
    Input:      an HTTP response object (see Snoopy)
    Output:     parsed RSS object (see rss_parse) with etag and
                last_modified copied from the response headers when
                present, or false if parsing failed (reported via error())
\*=======================================================================*/
function _response_to_rss ($resp) {
    $rss = new MagpieRSS( $resp->results, MAGPIE_OUTPUT_ENCODING, MAGPIE_INPUT_ENCODING, MAGPIE_DETECT_ENCODING );

    // parse failed: construct error message and bail out
    if ( !$rss or $rss->ERROR ) {
        $errormsg = "Failed to parse RSS file.";

        if ($rss) {
            $errormsg .= " (" . $rss->ERROR . ")";
        }
        error($errormsg);

        return false;
    }

    // find ETag and Last-Modified; tolerate a response without a header list
    $header_lines = ( isset($resp->headers) and is_array($resp->headers) )
        ? $resp->headers
        : array();

    foreach ($header_lines as $h) {
        // Split on the first colon only: values such as dates contain colons.
        // Lines without a colon (e.g. the status line) carry no value.
        if ( strpos($h, ':') ) {
            list($field, $val) = explode(':', $h, 2);
        }
        else {
            $field = $h;
            $val   = "";
        }

        // HTTP header names are case-insensitive, so compare in lower case.
        // Trimming drops the optional space after the colon and any line
        // terminator left on the raw header line.
        $field = strtolower(trim($field));
        $val   = trim($val);

        if ( $field == 'etag' ) {
            $rss->etag = $val;
        }

        if ( $field == 'last-modified' ) {
            $rss->last_modified = $val;
        }
    }

    return $rss;
}
327: 
/*=======================================================================*\
    Function:   init
    Purpose:    give every MAGPIE_* option its default value unless the
                calling script has already defined it; runs only once
\*=======================================================================*/
function init () {
    // already done on an earlier call
    if ( defined('MAGPIE_INITALIZED') ) {
        return;
    }
    define('MAGPIE_INITALIZED', true);

    // option name => default value
    $defaults = array(
        'MAGPIE_CACHE_ON'         => true,
        'MAGPIE_CACHE_DIR'        => './cache',
        'MAGPIE_CACHE_AGE'        => 60*60,        // one hour
        'MAGPIE_CACHE_FRESH_ONLY' => false,
        'MAGPIE_OUTPUT_ENCODING'  => 'ISO-8859-1',
        'MAGPIE_INPUT_ENCODING'   => null,
        'MAGPIE_DETECT_ENCODING'  => true,
        'MAGPIE_DEBUG'            => 0,
    );

    foreach ( $defaults as $option => $value ) {
        if ( !defined($option) ) {
            define($option, $value);
        }
    }

    // the default user agent advertises whether caching is in use, so it
    // has to be built after MAGPIE_CACHE_ON is settled
    if ( !defined('MAGPIE_USER_AGENT') ) {
        $tail = MAGPIE_CACHE_ON ? ')' : '; No cache)';

        define('MAGPIE_USER_AGENT', 'MagpieRSS/'. MAGPIE_VERSION . ' (+http://magpierss.sf.net' . $tail);
    }

    // 5 second timeout
    if ( !defined('MAGPIE_FETCH_TIME_OUT') ) {
        define('MAGPIE_FETCH_TIME_OUT', 5);
    }

    // use gzip encoding to fetch rss files if supported?
    if ( !defined('MAGPIE_USE_GZIP') ) {
        define('MAGPIE_USE_GZIP', true);
    }
}
395: 
396: // NOTE: the following code should really be in Snoopy, or at least
397: // somewhere other than rss_fetch!
398: 
399: /*=======================================================================*\
400:     HTTP STATUS CODE PREDICATES
401:     These functions attempt to classify an HTTP status code
402:     based on RFC 2616 and RFC 2518.
403: 
404:     All of them take an HTTP status code as input, and return true or false
405: 
406:     All this code is adapted from LWP's HTTP::Status.
407: \*=======================================================================*/
408: 
409: 
/*=======================================================================*\
    Function:   is_info
    Purpose:    return true if Informational status code (100-199)
\*=======================================================================*/
function is_info ($sc) {
    return (100 <= $sc) && (200 > $sc);
}
417: 
/*=======================================================================*\
    Function:   is_success
    Purpose:    return true if Successful status code (200-299)
\*=======================================================================*/
function is_success ($sc) {
    // below the 2xx range
    if ( !($sc >= 200) ) {
        return false;
    }

    return $sc < 300;
}
425: 
/*=======================================================================*\
    Function:   is_redirect
    Purpose:    return true if Redirection status code (300-399)
\*=======================================================================*/
function is_redirect ($sc) {
    return ($sc >= 300 && $sc < 400) ? true : false;
}
433: 
/*=======================================================================*\
    Function:   is_error
    Purpose:    return true if Error status code, client or server (400-599)
\*=======================================================================*/
function is_error ($sc) {
    $at_least_400 = ($sc >= 400);
    $below_600    = ($sc < 600);

    return $at_least_400 && $below_600;
}
441: 
/*=======================================================================*\
    Function:   is_client_error
    Purpose:    return true if Error status code caused by the client
                (400-499)
\*=======================================================================*/
function is_client_error ($sc) {
    if ( $sc >= 400 ) {
        if ( $sc < 500 ) {
            return true;
        }
    }

    return false;
}
449: 
/*=======================================================================*\
    Function:   is_server_error
    Purpose:    return true if Error status code caused by the server
                (500-599)
\*=======================================================================*/
function is_server_error ($sc) {
    return ($sc >= 500 and $sc < 600);
}
457: 
458: ?>