| 
<?php
include 'lib/phpQuery-onefile.php';
 /**
 *
 * This class lets you extract data from any site.
 * The data is taken from defined locations in the DOM structure.
 * Data points are defined using phpQuery notation, which is similar to the selectors used in the jQuery library.
 * This class can fetch data in three different modes:
 *  - scanning a single page
 *  - scanning a "from->to" range of pages matching a defined URL schema
 *  - scanning a list of URLs supplied as a PHP array
 *
 * @example scrape a single page
 * $scrap = new Scraper();
 * $scrap->setBaseUrl('http://page.to.scrap/index.html');
 * $scrap->addDataTarget('title', '#product h1');
 * $data = $scrap->process();
 *
 * @example scrape a range of pages
 * $scrap = new Scraper();
 * $scrap->setBaseUrl('http://example.url.com/details.html?id=##TOKEN##');
 * $scrap->addRangeScanRule(151598039, 151598042, '##TOKEN##');
 * $scrap->addDataTarget('name', '#head1 .title h1');
 * $data = $scrap->process();
 *
 * @example scrape a list of custom URLs
 * $scrap = new Scraper();
 * $myUrls = array('http://site.ccm/ulr1/', 'http://site.ccm/ulr2/', 'http://site.ccm/ulr3/');
 * $scrap->addListScanRule($myUrls);
 * $scrap->addDataTarget('title', '#content .ogloszenie_item h1');
 * $scrap->addDataTarget('image', '#content .ogloszenie_thumb a');
 * $scrap->addDataTarget('price', '#content .ogloszenie_item:contains(\'Cena:\')');
 * $data = $scrap->process();
 *
 * The downloaded data is returned as an array.
 * You can do whatever you want with the data ;)
 *
 * @package Scraper
 * @see This class uses the phpQuery library
 * @link http://code.google.com/p/phpquery/
 *
 * @author JLukasiewicz jlukasie at gmail
 *
 */
 class Scraper
 {
     /**
      * Base URL: the page to scan in single-page mode, or the URL template
      * (containing the token) in range mode.
      *
      * @var string
      */
     private $baseUrl = '';

     /**
      * Scan rules keyed by type: 'range' => array(min, max, token) and/or
      * 'list' => array of URLs. When both are present, 'range' wins.
      *
      * @var array
      */
     private $scanRule = array();

     /**
      * Data points: result key => selector in phpQuery notation.
      *
      * @var array
      */
     private $dataTargets = array();


     public function __construct()
     {
     }

     /**
      * Sets the base URL.
      *
      * @param string $url page URL, or URL template containing the range token
      * @throws Exception when $url is empty
      */
     public function setBaseUrl($url)
     {
         if (empty($url)) {
             throw new Exception('Value not specified: url', 1);
         }

         $this->baseUrl = $url;
     }

     /**
      * Stores a scan rule under the given type, replacing any previous rule
      * of that type.
      *
      * @param string $type  'range' or 'list'
      * @param mixed  $value rule definition
      * @throws Exception when $type or $value is empty
      */
     private function setScanRule($type, $value)
     {
         if (empty($type) || empty($value)) {
             throw new Exception('Value not specified: type or value', 1);
         }

         $this->scanRule[$type] = $value;
     }

     /**
      * Tells whether a range bound was left out.
      *
      * Unlike empty(), this does NOT treat 0, 0.0 or '0' as missing, so a
      * range may legitimately start (or end) at zero.
      *
      * @param mixed $value
      * @return bool
      */
     private static function isMissing($value)
     {
         return $value === null
             || $value === ''
             || $value === false
             || $value === array();
     }

     /**
      * Adds a range rule: every value from $min to $max (inclusive) replaces
      * $token in the base URL to build the list of pages to scan.
      *
      * The token is not validated here; if it is empty or does not occur in
      * the base URL, every generated URL equals the base URL.
      *
      * @param int    $min   first value of the range (0 is allowed)
      * @param int    $max   last value of the range, inclusive
      * @param string $token placeholder in the base URL
      * @throws Exception when $min or $max is missing
      */
     public function addRangeScanRule($min, $max, $token)
     {
         if (self::isMissing($min) || self::isMissing($max)) {
             throw new Exception('Value not specified: min or max', 1);
         }

         $this->setScanRule('range', array('min' => $min, 'max' => $max, 'token' => $token));
     }

     /**
      * Adds a list rule: the given URLs are scanned as they are.
      *
      * @param array $list URLs to scan
      * @throws Exception when $list is empty or not an array
      */
     public function addListScanRule($list)
     {
         if (empty($list) || !is_array($list)) {
             throw new Exception('address list is not specified', 1);
         }

         $this->setScanRule('list', $list);
     }

     /**
      * Registers a data point. Registering the same name twice overwrites the
      * earlier selector.
      *
      * @param string $name     key under which the value appears in the result
      * @param string $selector selector in phpQuery notation
      * @throws Exception when $name or $selector is empty
      */
     public function addDataTarget($name, $selector)
     {
         if (empty($name) || empty($selector)) {
             throw new Exception('Value not specified: name or selector', 1);
         }

         $this->dataTargets[$name] = $selector;
     }

     /**
      * Performs the scan.
      *
      * Each URL is downloaded once and the result of html() on each data
      * target's selector is collected. URLs that cannot be fetched (or that
      * return an empty body) are skipped silently - the scan is best-effort.
      *
      * @return array list of rows, one per successfully fetched page; each
      *               row maps data-target name => extracted HTML
      * @throws Exception when the configured scan rule is unusable
      */
     public function process()
     {
         $data = array();

         $urls = $this->getUrlsToScan();

         // Without data targets no row can ever be produced, so do not
         // download anything.
         if (empty($this->dataTargets)) {
             return $data;
         }

         foreach ($urls as $url) {
             // An empty path makes file_get_contents() throw ValueError on
             // PHP 8 (and '@' cannot suppress that), so treat such entries
             // like any other unreachable URL.
             if (!is_string($url) || $url === '') {
                 continue;
             }

             // '@' is deliberate: an unreachable page is skipped, not fatal.
             $markup = @file_get_contents($url);
             if ($markup === false || $markup === '') {
                 continue;
             }

             // Parse the markup already fetched instead of letting phpQuery
             // download the same URL a second time.
             $document = phpQuery::newDocumentHTML($markup);

             $row = array();
             foreach ($this->dataTargets as $name => $selector) {
                 $row[$name] = $document->find($selector)->html();
             }

             // phpQuery keeps every document in a static registry; release it
             // so long range scans do not accumulate parsed pages in memory.
             $document->unloadDocument();

             $data[] = $row;
         }

         return $data;
     }

     /**
      * Builds the list of URLs to scan from the configured rules.
      *
      *  - no rule:     the base URL alone
      *  - 'range' set: base URL with the token replaced by min..max
      *  - 'list' set:  the list as given (only when no range rule exists)
      *
      * @return array
      * @throws Exception code 2 when a range rule is set without a base URL,
      *                   code 3 when the range rule lacks min or max
      */
     private function getUrlsToScan()
     {
         if (empty($this->scanRule)) {
             return array($this->baseUrl);
         }

         if (!empty($this->scanRule['range'])) {
             if (empty($this->baseUrl)) {
                 throw new Exception('baseUrl not specified', 2);
             }

             $range = $this->scanRule['range'];

             // isset() rather than !empty(): a bound of 0 is valid.
             if (!isset($range['min'], $range['max'])) {
                 throw new Exception('scanRule invalid format', 3);
             }

             $urls = array();
             for ($i = $range['min']; $i <= $range['max']; $i++) {
                 $urls[] = str_replace($range['token'], (string) $i, $this->baseUrl);
             }

             return $urls;
         }

         if (!empty($this->scanRule['list'])) {
             return $this->scanRule['list'];
         }

         return array();
     }
 }
 |