Simplified the regex and made it follow <link> tags as well, which will pick up RSS links and other links. I edited it because my site uses buttons (JS) rather than <a> links to navigate between pages, but I include two links in the head which point to the next and previous pages of a document.
Even if you remove the link part, you can get rid of all the ridiculous [Aa] stuff; the /i (case-insensitive) modifier will work just fine.
--- Crawler.class.php 2006-01-29 21:19:00.000000000 -0600
+++ ../../../inc/classes/Crawler.class.php 2008-07-02 17:39:09.000000000 -0500
@@ -207,17 +207,16 @@
}
// contribution by vvkov
-// preg_match_all("/<[Aa][ \r\n\t]{1}[^>]*[Hh][Rr][Ee][Ff][^=]*=[ '\"\n\r\t]*([^ \"'>]+)[^>]*>/",$res ,$urls);
- preg_match_all("/<[Aa][^>]*[Hh][Rr][Ee][Ff]=['\"]([^\"'>]+)[^>]*>/",$res ,$urls); // update by TK, 2005-07-27
- $urls_count = count( $urls[1] );
-
+ preg_match_all("/<(a|link)[^>]*href=['\"]([^\"'>]+)[^>]*>/i",$res ,$urls); // update by TK, 2005-07-27
+ $urls_count = count( $urls[2] );
+
if (preg_match("/<[Bb][Aa][Ss][Ee][^>]*[Hh][Rr][Ee][Ff]=['\"]([^\"'>]+)[^>]*>/", $res, $matches)) {
$this->base = $matches[1];
}
$ts_begin = $this->microtime_float();
while ((($ts_middle = ($this->microtime_float()-$ts_begin)) < PSNG_CRAWLER_MAX_GETFILE_TIME) && $urls_count > 0 ) {
- $thisurl = trim(str_replace('&', '&', $urls[1][--$urls_count]));
+ $thisurl = trim(str_replace('&', '&', $urls[2][--$urls_count]));
if ($thisurl == '' || (strcasecmp(substr($thisurl, 0, strlen('javascript:')), 'javascript:') == 0)) continue;
// filter out links to fragment ids (same resource) - added mk/2005-11-13
if ('#' == $thisurl{0}) continue;