| 
<?php
/*****/
 /*
 Written by: Aziz S. Hussain
 Email: [email protected]
 Website: www.azizsaleh.com
 Produced under GPL License
 */
 /*****/
 
 
 /*
 Email address scraper based on a URL.
 */
 
 class scraper
 {
     // URL of the page that is scraped next (the crawl starts here)
     public $startURL;

     // Extensions that mark a link as NOT worth crawling (assets, media, feeds).
     // The property keeps its historical name so existing callers that read or
     // overwrite it continue to work.
     public $allowedExtensions = array('.css','.xml','.rss','.ico','.js','.gif','.jpg','.jpeg','.png','.bmp','.wmv'
     ,'.avi','.mp3','.flash','.swf');

     // Host (without scheme) sent as the HTTP referer; nothing in this class sets it
     public $useURL;

     // Scheme + host prefix prepended to relative links
     public $startPath;

     // Set the prefix used to resolve relative links.
     // $path: explicit prefix; when empty it is derived from $this->startURL
     //        ("http://host/a/b" becomes "http://host").
     public function setStartPath($path = NULL)
     {
         if ($path != NULL) {
             $this->startPath = $path;
             return;
         }

         $temp = explode('/', (string) $this->startURL);
         // A URL without "scheme://host" has no third segment; avoid reading
         // an undefined offset in that case.
         $host = isset($temp[2]) ? $temp[2] : '';
         $this->startPath = $temp[0].'//'.$host;
     }

     // Set the URL the crawl starts from
     public function startURL($theURL)
     {
         $this->startURL = $theURL;
     }

     // Download a URL with cURL.
     // Returns the response body as a string, or false when the handle cannot
     // be created or the transfer fails (HTTP errors count as failures because
     // CURLOPT_FAILONERROR is enabled).
     public function getContents($url)
     {
         $ch = curl_init();
         if ($ch === false) {
             return false;
         }

         curl_setopt($ch, CURLOPT_HEADER, 0);
         curl_setopt($ch, CURLOPT_VERBOSE, 0);
         curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/4.0 (compatible;)");
         curl_setopt($ch, CURLOPT_AUTOREFERER, false);
         curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 7);
         // Only send a referer when one was configured; otherwise the header
         // would be the meaningless value "http://".
         if (!empty($this->useURL)) {
             curl_setopt($ch, CURLOPT_REFERER, 'http://'.$this->useURL);
         }
         curl_setopt($ch, CURLOPT_URL, $url);
         curl_setopt($ch, CURLOPT_FAILONERROR, 1);
         curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1); // allow redirects
         curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); // return the body instead of printing it
         curl_setopt($ch, CURLOPT_TIMEOUT, 50);       // give up after 50s
         curl_setopt($ch, CURLOPT_POST, 0);           // plain GET, not POST

         $buffer = curl_exec($ch);
         curl_close($ch);

         return $buffer;
     }

     // Crawl: scrape the current page, then keep pulling random URLs from the
     // `workingurls` table until it is empty.
     // This is a loop rather than self-recursion so a long crawl does not add
     // one stack frame per visited page.
     public function startScraping()
     {
         while (true) {
             $this->scrapeCurrentPage();

             $this->startURL = $this->takeNextWorkingURL();
             // If no more pages, stop
             if ($this->startURL == NULL) {
                 return;
             }
             $this->setStartPath();
         }
     }

     // Fetch $this->startURL, store every e-mail address found on it, mark the
     // URL as finished and queue the links it contains.
     private function scrapeCurrentPage()
     {
         $pageContent = $this->getContents($this->startURL);
         echo 'Scraping URL: '.$this->startURL.PHP_EOL;

         // A failed download yields false; treat it as an empty page
         if (!is_string($pageContent)) {
             $pageContent = '';
         }

         // Collect all e-mail addresses on the page
         $results = array();
         preg_match_all('/([\w+\.]*\w+@[\w+\.]*\w+[\w+\-\w+]*\.\w+)/is', $pageContent, $results);
         $emails = isset($results[1]) ? $results[1] : array();

         $insertCount = 0;
         foreach ($emails as $curEmail) {
             // A failed insert (for instance a rejected duplicate) is not counted
             if ($this->insertValue('emaillist', 'emailadd', $curEmail)) {
                 $insertCount++;
             }
         }
         echo 'Emails found: '.number_format($insertCount).PHP_EOL;

         // Mark the page done
         $this->insertValue('finishedurls', 'urlname', $this->startURL);

         // Collect the links on the page and queue the usable ones
         $results = array();
         preg_match_all('/href="([^"]+)"/Umis', $pageContent, $results);
         $links = isset($results[1]) ? $results[1] : array();

         $insertURLCount = 0;
         foreach ($this->cleanListURLs($links) as $curURL) {
             if ($this->insertValue('workingurls', 'urlname', $curURL)) {
                 $insertURLCount++;
             }
         }
         echo 'URLs found: '.number_format($insertURLCount).PHP_EOL;
     }

     // Insert one value into a single-column slot of a table.
     // $table and $column are fixed identifiers supplied by this class; $value
     // comes from scraped pages and is therefore escaped before it reaches SQL.
     // Returns true when the insert succeeded.
     // NOTE: the mysql_* extension was removed in PHP 7.0; it is kept here
     // because the connection is opened outside this class.
     private function insertValue($table, $column, $value)
     {
         $safe = mysql_real_escape_string((string) $value);

         return (bool) mysql_query("INSERT INTO `$table` (`$column`) VALUES ('$safe')");
     }

     // Pick a random URL from `workingurls`, remove it from the table and
     // return it. Returns NULL when the query fails or the table is empty.
     private function takeNextWorkingURL()
     {
         $result = mysql_query("SELECT `urlname` FROM `workingurls` ORDER BY RAND() LIMIT 1");
         if (!$result) {
             return NULL;
         }

         $row = mysql_fetch_assoc($result);
         if (!$row) {
             return NULL;
         }

         $url = $row['urlname'];
         mysql_query("DELETE FROM `workingurls` WHERE `urlname`='".mysql_real_escape_string($url)."' LIMIT 1");

         return $url;
     }

     // Filter a list of href values down to crawlable URLs.
     // Rejected links are removed; links starting with "/", "?" or "=" are
     // prefixed with $this->startPath. Original array keys are preserved.
     public function cleanListURLs($linkList)
     {
         foreach ($linkList as $sub => $url) {
             if ($this->isRejectedURL($url)) {
                 unset($linkList[$sub]);
                 // Skip the rewrite below, otherwise a rejected relative link
                 // (e.g. "/style.css") would be put straight back into the list.
                 continue;
             }

             // Path is relative: add the starting path
             $first = substr($url, 0, 1);
             if ($first == '/' || $first == '?' || $first == '=') {
                 $linkList[$sub] = $this->startPath.$url;
             }
         }

         return $linkList;
     }

     // Decide whether a link must be dropped from the crawl list.
     private function isRejectedURL($url)
     {
         // A single character (or nothing) is not a usable link
         if (strlen($url) <= 1) {
             return true;
         }
         // In-page anchors
         if (substr($url, 0, 1) == '#') {
             return true;
         }
         // Any javascript: pseudo link (stripos replaces eregi, removed in PHP 7.0)
         if (stripos($url, 'javascript') !== false) {
             return true;
         }
         // Unwanted extensions, compared case-insensitively so ".JPG" is caught.
         // This is a substring match anywhere in the URL, so ".js" also rejects
         // "page.jsp".
         foreach ($this->allowedExtensions as $extension) {
             if ($extension !== '' && stripos($url, $extension) !== false) {
                 return true;
             }
         }

         return false;
     }

 }
 ?>
 |