PHP – Class Simple Page Crawler

Here is an example of a simple page crawler written in PHP.

Code Example: Download Example
index.php
<?php

require_once 'simple_crawler.class.php';

$reader = new HtmlReader();

//site front page; also the base every local link is resolved against
$page = 'http://example.com';

//read content from url
$html = $reader->getPageContent($page);
if($html === false) {
	//nothing to parse when the front page cannot be read
	die('Unable to read '.$page);
}

//document content object
$htmlDoc = new HtmlDocument($html);

//document body part object
$body = $htmlDoc->getBody();

//objects array of page links
$links = $body->grabLinks();

//clean text version of document body object
$cleanBody = $body->getStrippedBody();

//counted words from cleaned document body (word=>count)
$words = new BodyWords();
$pageWords = $words->findWords($cleanBody->getContent());
$words->appendWords($pageWords);

//follow front page links one level deep (links found on the followed pages are not crawled)
$baseUrl = rtrim($page, '/');
$visited = array();
foreach($links as $link) {
	if($link->type != 1) {
		continue; //no follow external links (type 1 = local)
	}

	//a fragment never changes which document is served, so drop it
	$path = preg_replace('/#.*$/s', '', (string) $link->url);

	//empty or "/" is the front page itself, already counted above
	if($path === '' || $path === '/') continue;

	//mailto:, javascript:, //host/... are not paths on this site
	if(preg_match('@^(?:[a-z][a-z0-9+.\-]*:|//)@i', $path)) continue;

	//fetch each page once even when it is linked several times
	if(isset($visited[$path])) continue;
	$visited[$path] = true;

	//exactly one "/" between host and path, whether or not the link starts with one
	$pageLink = $baseUrl.'/'.ltrim($path, '/');
	$html = $reader->getPageContent($pageLink);
	if($html === false) {
		continue; //page could not be read, skip it
	}

	$htmlDoc = new HtmlDocument($html);
	$body = $htmlDoc->getBody();
	$cleanBody = $body->getStrippedBody();

	$pageWords = $words->findWords($cleanBody->getContent());
	$words->appendWords($pageWords, $path);
}

//display words:count per page
echo "<pre>";
print_r($words->getWords());
echo "</pre>";
//here you may do something with this words

?>
simple_crawler.class.php
<?php
/**
 * Base crawler class.
 *
 * Holds the markup clean-up helpers shared by the crawler classes.
 */
class CrawlerBase {

	/**
	 * Remove every <$tag ...>...</$tag> element, contents included, from $content.
	 *
	 * @param string $tag     element name without angle brackets, e.g. "script"
	 * @param string $content markup to clean
	 * @return string markup without the matched elements; the input is returned
	 *                unchanged if the regex engine reports an error
	 */
	private function removeTag($tag, $content) {
		$content = (string) $content;
		$name = preg_quote($tag, '@');
		//\b stops "<script" from matching longer element names.
		//No "u" modifier: with it preg_replace() returns null for pages that are
		//not valid UTF-8, which would silently discard the whole document.
		$pat = '@<'.$name.'\b[^>]*>.*?</'.$name.'\s*>@si';
		$cleaned = preg_replace($pat, '', $content);
		return $cleaned === null ? $content : $cleaned;
	}

	/**
	 * Remove elements whose contents are not readable page text.
	 *
	 * Strips <script>, <style> and <object> elements. Inline CSS lives in
	 * <style>; HTML has no <css> element.
	 *
	 * @param string $content markup to clean
	 * @return string cleaned markup
	 */
	public function cleanContent($content) {
		foreach (array('script', 'style', 'object') as $tag) {
			$content = $this->removeTag($tag, $content);
		}
		return $content;
	}

}

/**
 * Fetches the raw content of a page
 */
class HtmlReader extends CrawlerBase {

	/**
	 * Read the resource at $url.
	 *
	 * @param string $url location handed straight to file_get_contents()
	 * @return string|false whatever file_get_contents() returns (false on failure)
	 */
	public function getPageContent($url) {
		$pageSource = file_get_contents($url);

		return $pageSource;
	}
}

/**
 * Whole html document; hands out objects for its head and body sections
 */
class HtmlDocument extends CrawlerBase {

	/**
	 * Markup received by the constructor, kept as given
	 */
	private $content;

	/**
	 * @param string $content complete document markup
	 */
	public function __construct($content) {
		$this->content = $content;
	}

	/**
	 * Build the head section object from the stored markup.
	 *
	 * @return HtmlDocumentHead
	 */
	public function getHead() {
		$head = new HtmlDocumentHead($this->content);

		return $head;
	}

	/**
	 * Build the body section object from the stored markup.
	 *
	 * @return HtmlDocumentBody
	 */
	public function getBody() {
		$body = new HtmlDocumentBody($this->content);

		return $body;
	}

}

/**
 * html Head section class
 */
class HtmlDocumentHead extends CrawlerBase {

	/**
	 * Markup found between the opening and closing head tags,
	 * or null when no head section was found
	 */
	private  $content;

	/**
	 * @param string|null $htmlContent complete document markup; nothing is parsed when empty
	 */
	public function __construct($htmlContent = null) {
		if(!empty($htmlContent)) {
			$this->findDocumentHead($htmlContent);
		}
	}

	/**
	 * Find the document <head> part and store its inner markup.
	 *
	 * The stored content is reset to null when the document has no head section.
	 *
	 * @param string $htmlContent complete document markup
	 */
	public function findDocumentHead($htmlContent) {
		$matches = array();
		//\b[^>]*> accepts attributes on the opening tag while still rejecting <header>
		if(preg_match('/<head\b[^>]*>(.*)<\/head\s*>/si', (string) $htmlContent, $matches)) {
			$this->content = $matches[1];
		} else {
			$this->content = null;
		}
	}

	/**
	 * @return string|null inner markup of the head section, null when none was found
	 */
	public function getDocumentHead() {
		return $this->content;
	}

}

/**
 * html body section class
 */
class HtmlDocumentBody extends CrawlerBase  {

	/**
	 * Markup found between the opening and closing body tags,
	 * or null when no body section was found
	 */
	private $content;

	/**
	 * @param string|null $htmlContent complete document markup
	 */
	public function __construct($htmlContent = null) {
		$this->findDocumentBody($htmlContent);
	}

	/**
	 * Read links from content.
	 *
	 * Each match is handed to ContentLink as array(0 => whole <a> element,
	 * 1 => quote character, 2 => href value, 3 => anchor markup).
	 *
	 * @return ContentLink[] one object per <a ... href=...>...</a> element, in document order
	 */
	public function grabLinks() {

		$links = array();
		$matches = array();
		//The U modifier inverts greediness: "??" and "*?" are greedy here, the plain
		//(.*) is lazy and stops at the first </a>. The quote group accepts ' as well
		//as ", and \1 requires the same character to close the value.
		$regexp = "<a\s[^>]*href=([\"']??)([^\" >]*?)\\1[^>]*>(.*)<\/a>";
		preg_match_all("/$regexp/siU", (string) $this->content, $matches, PREG_SET_ORDER);
		if(!empty($matches)) {
			foreach ($matches as $link) {
				$links[] = new ContentLink($link);
			}
		}
		return 	$links;
	}

	/**
	 * Find the document <body> part and store its inner markup.
	 *
	 * The stored content is reset to null when the document has no body section.
	 *
	 * @param string|null $content complete document markup
	 */
	public function findDocumentBody($content) {
		$matches = array();
		//\b[^>]*> accepts attributes on the opening tag (<body class="...">);
		//the greedy (.*) runs up to the last closing body tag
		if(preg_match('/<body\b[^>]*>(.*)<\/body\s*>/is', (string) $content, $matches)) {
			$this->content = $matches[1];
		} else {
			$this->content = null;
		}
	}

	/**
	 * @return string|null inner markup of the body section, null when none was found
	 */
	public function getDocumentBody() {
		return $this->content;
	}

	/**
	 * @return StrippedBody object built from the stored body markup
	 */
	public function getStrippedBody() {
		return new StrippedBody($this->content);
	}

}

/**
 * Plain-text version of a document body (all tags removed)
 */
class StrippedBody extends CrawlerBase {

	/**
	 * Tag-free text; stays unset (null) when the constructor received empty input
	 */
	private  $content;

	/**
	 * @param string|null $UnsrtippedBodyContent body markup that still contains tags
	 */
	public function __construct($UnsrtippedBodyContent = null) {
		if(empty($UnsrtippedBodyContent)) {
			return;
		}

		//first pass through the inherited cleanContent(), then drop the remaining tags
		$cleanedMarkup = $this->cleanContent($UnsrtippedBodyContent);
		$this->content = strip_tags($cleanedMarkup);
	}

	/**
	 * @return string|null the stripped text
	 */
	public function getContent() {
		return $this->content;
	}
}

/**
 * link class
 */
class ContentLink extends CrawlerBase {

	/** link type: path on the crawled site */
	const TYPE_LOCAL = 1;

	/** link type: anything carrying its own scheme or host */
	const TYPE_EXTERNAL = 2;

	/** complete matched <a ...>...</a> element */
	public $fullUrl;

	/** href value */
	public $url;

	/** anchor text with tags removed */
	public $anchor;

	/** TYPE_LOCAL (1) or TYPE_EXTERNAL (2) */
	public $type;

	/**
	 * @param array $linkData regex match: 0 => whole element, 2 => href value, 3 => anchor markup
	 */
	public function __construct(array $linkData) {
		$this->parseLinkData($linkData);
	}

	/**
	 * Fill the public properties from one regex match.
	 *
	 * Missing entries are treated as empty strings.
	 *
	 * @param array $linkData regex match: 0 => whole element, 2 => href value, 3 => anchor markup
	 */
	public function parseLinkData(array $linkData) {
		$this->fullUrl = isset($linkData[0]) ? $linkData[0] : '';
		$this->url = isset($linkData[2]) ? $linkData[2] : '';
		$this->anchor = strip_tags(isset($linkData[3]) ? $linkData[3] : '');

		//External: a leading URI scheme (http:, HTTPS:, mailto:, javascript:, ...) or a
		//protocol-relative "//host/..." reference. A relative path that merely starts
		//with the letters "http" (e.g. "httpd-notes.html") stays local.
		if(preg_match('@^(?:[a-z][a-z0-9+.\-]*:|//)@i', $this->url)) {
			$this->type = self::TYPE_EXTERNAL;
		} else {
			$this->type = self::TYPE_LOCAL;
		}
	}
}

/**
 * body words class. find, count, append
 */
class BodyWords extends CrawlerBase {

	/**
	 * Collected counts: page => (word => count)
	 */
	private $words = array();

	/**
	 * Words shorter than this many bytes are dropped from the counts
	 */
	private $minWordLength;

	/**
	 * @param int $minWordLength minimum word length, in bytes, for a word to be counted
	 */
	public function __construct($minWordLength = 4) {
		$this->minWordLength = $minWordLength;
	}

	/**
	 * Count occurrences, drop short words and sort ascending by count.
	 *
	 * @param array $uncountedWordsArray list of words, duplicates allowed
	 * @return array word => count
	 */
	private function countWords($uncountedWordsArray) {
		$wordsArray = array_count_values($uncountedWordsArray);
		$this->removeShortWords($wordsArray);
		asort($wordsArray);
		return $wordsArray;
	}

	/**
	 * Remove, in place, every word shorter than the configured minimum length.
	 *
	 * @param array $countedWordsArray word => count, modified by reference
	 */
	private function removeShortWords(&$countedWordsArray) {
		if(!empty($countedWordsArray)) {
			foreach($countedWordsArray as $word => $count) {
				//strlen() measures bytes, so multi-byte characters count more than once
				if(strlen($word) < $this->minWordLength) {
					unset($countedWordsArray[$word]);
				}
			}
		}
	}

	/**
	 * Split text on whitespace and the characters , . ? ! and count the words.
	 *
	 * @param string|null $cleanBodyText tag-free text; null is treated as empty text
	 * @return array word => count, sorted ascending by count
	 */
	public function findWords($cleanBodyText) {
		//cast: passing null as the subject is deprecated; NO_EMPTY drops the empty
		//pieces produced by leading or trailing separators
		$uncountedWordsArray = preg_split("/[\s,.?!]+/", (string) $cleanBodyText, -1, PREG_SPLIT_NO_EMPTY);
		return $this->countWords($uncountedWordsArray);
	}

	/**
	 * Add word counts to the totals kept for $page.
	 *
	 * Counts for a word already recorded for the same page are summed.
	 *
	 * @param array  $wordsArray word => count
	 * @param string $page       key the counts are stored under
	 */
	public function appendWords($wordsArray, $page='/') {
		if(empty($wordsArray)) {
			return;
		}
		foreach ($wordsArray as $word => $count) {
			//look the word up under its page; the top level of $this->words is keyed by page
			if(isset($this->words[$page][$word])) {
				$this->words[$page][$word] += $count;
			} else {
				$this->words[$page][$word] = $count;
			}
		}
	}

	/**
	 * @return array page => (word => count)
	 */
	public function getWords() {
		return $this->words;
	}
}
?>

Post to Twitter Post to Digg Post to Facebook Post to Google Buzz Send Gmail

Leave a Comment

Your email address will not be published. Required fields are marked *