@dani
I got ChatGPT to fix my crawler, which was showing an error.
This is the code that was showing the error:
My Buggy Code
<?php
//START OF SCRIPT FLOW.
//Preparing Crawler & Session: Initialising Variables.
//Preparing $ARRAYS For Step 1: To Deal with Xml Links meant for Crawlers only.
//SiteMaps Details Scraped from SiteMaps or Xml Files.
$sitemaps = []; //This will list extracted further Xml SiteMap links (.xml) found on Sitemaps (.xml).
$sitemaps_last_mods = []; //This will list dates of SiteMap pages last modified - found on Sitemaps.
$sitemaps_change_freqs = []; //This will list SiteMap pages' change frequencies - found on Sitemaps.
$sitemaps_priorities = []; //This will list SiteMap pages priorities - found on Sitemaps.
//Webpage Details Scraped from SiteMaps or Xml Files.
$html_page_urls = []; //This will list extracted html links Urls (.html, .htm, .php) - found on Sitemaps (.xml).
$html_page_last_mods = []; //This will list dates of html pages last modified - found on Sitemap.
$html_page_change_freqs = []; //This will list html pages' change frequencies - found on Sitemaps.
$html_page_priorities = []; //This will list html pages priorities - found on Sitemaps.
//Preparing $ARRAYS For Step 2: To Deal with html pages meant for Human Visitors only.
//Data Scraped from Html Files. Not Xml SiteMap Files.
$html_page_meta_names = []; //This will list crawled pages Meta Tag Names - found on html pages.
$html_page_meta_descriptions = []; //This will list crawled pages Meta Tag Descriptions - found on html pages.
$html_page_titles = []; //This will list crawled pages Titles - found on html pages.
// -----
//Step 1: Initiate Session - Feed Xml SiteMap Url. Crawling Starting Point.
//Crawl Session Starting Page/Initial Xml Sitemap. (NOTE: Has to be an .xml Sitemap).
//$initial_url = "https://www.rocktherankings.com/sitemap_index.xml"; //Has more xml files.
$initial_url = "http://localhost/Work/buzz/Templates/0.xml";
//$xmls = file_get_contents($initial_url); //Should I stick to this line or below line ?
//Parse the sitemap content to object
//$xml = simplexml_load_string($xmls); //Should I stick to this line or above line ?
$xml = simplexml_load_string(file_get_contents($initial_url)); //Code from Dani: https://www.daniweb.com/programming/web-development/threads/540168/what-to-lookout-for-to-prevent-crawler-traps
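//NOTE: As far as I understand, both options do the same thing - simplexml_load_file($initial_url) would simply combine the file_get_contents() and simplexml_load_string() steps into one call.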
$dom = new DOMDocument();
$dom->loadXML($xml); //LINE: 44z
//$result = @$dom->loadXML($xml); //LINE: 44
echo __LINE__; echo '<br>'; //LINE: 46
extract_links($xml);
echo __LINE__; echo '<br>'; //LINE: 50
foreach($sitemaps AS $sitemap)
{
echo __LINE__; echo '<br>';
extract_links($sitemap); //Extract Links on page.
}
foreach($html_page_urls AS $html_page_url)
{
echo __LINE__; echo '<br>';
$scrape_page_data($html_page_url); //Extract Meta Data & Title from page.
}
//END OF SCRIPT FLOW.
//FUNCTIONS BEYOND THIS POINT.
//Links Extractor.
function extract_links()
{
echo __LINE__; echo '<br>'; //LINE: 73
GLOBAL $dom;
//Trigger following IF/ELSEs on each Crawled Page to check for link types. Whether Links lead to more SiteMaps (.xml) or webpages (.html, .htm, .php, etc.).
if ($dom->nodeName === 'sitemapindex') //Current Xml SiteMap Page lists more Xml SiteMaps. It lists links to further Xml SiteMaps, not links to html pages.
{
echo __LINE__; echo '<br>';
//parse the index
// retrieve properties from the sitemap object
foreach ($xml->sitemapindex as $urlElement) //Extracts xml file urls.
{
// get properties
$sitemaps[] = $sitemap_url = $urlElement->loc;
$sitemaps_last_mods[] = $last_mod = $urlElement->lastmod;
$sitemaps_change_freqs[] = $change_freq = $urlElement->changefreq;
$sitemaps_priorities[] = $priority = $urlElement->priority;
// print out the properties
echo 'url: '. $sitemap_url . '<br>';
echo 'lastmod: '. $last_mod . '<br>';
echo 'changefreq: '. $change_freq . '<br>';
echo 'priority: '. $priority . '<br>';
echo '<br>---<br>';
}
}
else if ($dom->nodeName === 'urlset') //Current Xml SiteMap Page lists no further Xml SiteMap links, only html links.
{
echo __LINE__; echo '<br>';
//parse url set
// retrieve properties from the sitemap object
foreach ($xml->urlset as $urlElement) //Extracts Sitemap Urls.
{
// get properties
$html_page_urls[] = $html_page_url = $urlElement->loc;
$html_page_last_mods[] = $last_mod = $urlElement->lastmod;
$html_page_change_freqs[] = $change_freq = $urlElement->changefreq;
$html_page_priorities[] = $priority = $urlElement->priority;
// print out the properties
echo 'url: '. $html_page_url . '<br>';
echo 'lastmod: '. $last_mod . '<br>';
echo 'changefreq: '. $change_freq . '<br>';
echo 'priority: '. $priority . '<br>';
echo '<br>---<br>';
}
}
GLOBAL $sitemaps;
GLOBAL $sitemaps_last_mods;
GLOBAL $sitemaps_change_freqs;
GLOBAL $sitemaps_priorities;
GLOBAL $html_page_urls;
GLOBAL $html_page_last_mods;
GLOBAL $html_page_change_freqs;
GLOBAL $html_page_priorities;
echo 'SiteMaps Crawled: ---'; echo '<br><br>';
if(array_count_values($sitemaps)>0)
{
print_r($sitemaps);
echo '<br>';
}
elseif(array_count_values($sitemaps_last_mods)>0)
{
print_r($sitemaps_last_mods);
echo '<br>';
}
elseif(array_count_values($sitemaps_change_freqs)>0)
{
print_r($sitemaps_change_freqs);
echo '<br>';
}
elseif(array_count_values($sitemaps_priorities)>0)
{
print_r($sitemaps_priorities);
echo '<br><br>';
}
echo 'Html Pages Crawled: ---'; echo '<br><br>';
if(array_count_values($html_page_urls)>0)
{
print_r($html_page_urls);
echo '<br>';
}
if(array_count_values($html_page_last_mods)>0)
{
print_r($html_page_last_mods);
echo '<br>';
}
if(array_count_values($html_page_change_freqs)>0)
{
print_r($html_page_change_freqs);
echo '<br>';
}
if(array_count_values($html_page_priorities)>0)
{
print_r($html_page_priorities);
echo '<br>';
}
}
//Meta Data & Title Extractor.
function scrape_page_data()
{
GLOBAL $html_page_urls;
if(array_count_values($html_page_urls)>0)
{
foreach($html_page_urls AS $url)
{
// https://www.php.net/manual/en/function.file-get-contents
$html = file_get_contents($url);
//https://www.php.net/manual/en/domdocument.construct.php
$doc = new DOMDocument();
// https://www.php.net/manual/en/function.libxml-use-internal-errors.php
libxml_use_internal_errors(true);
// https://www.php.net/manual/en/domdocument.loadhtml.php
$doc->loadHTML($html, LIBXML_COMPACT|LIBXML_NOERROR|LIBXML_NOWARNING);
// https://www.php.net/manual/en/function.libxml-clear-errors.php
libxml_clear_errors();
// https://www.php.net/manual/en/domdocument.getelementsbytagname.php
$meta_tags = $doc->getElementsByTagName('meta');
// https://www.php.net/manual/en/domnodelist.item.php
if ($meta_tags->length > 0)
{
// https://www.php.net/manual/en/class.domnodelist.php
foreach ($meta_tags as $tag)
{
// https://www.php.net/manual/en/domnodelist.item.php
echo 'Meta Name: ' .$meta_name = $tag->getAttribute('name'); echo '<br>';
echo 'Meta Content: ' .$meta_content = $tag->getAttribute('content'); echo '<br>';
$html_page_meta_names[] = $meta_name;
$html_page_meta_descriptions[] = $meta_content;
}
}
//EXAMPLE 1: Extract Title
$title_tag = $doc->getElementsByTagName('title');
if ($title_tag->length>0)
{
echo 'Title: ' .$title = $title_tag[0]->textContent; echo '<br>';
$html_page_titles[] = $title;
}
//EXAMPLE 2: Extract Title
$title_tag = $doc->getElementsByTagName('title');
for ($i = 0; $i < $title_tag->length; $i++) {
echo 'Title: ' .$title = $title_tag->item($i)->nodeValue . "\n";
$html_page_titles[] = $title;
}
}
}
}
if(array_count_values($html_page_meta_names)>0)
{
print_r($html_page_meta_names);
echo '<br>';
}
if(array_count_values($html_page_meta_descriptions)>0)
{
print_r($html_page_meta_descriptions);
echo '<br>';
}
if(array_count_values($html_page_titles)>0)
{
print_r($html_page_titles);
echo '<br>';
}
//END OF FUNCTIONS.
ChatGPT fixed it to the following. Do let me know if the code is OK or not. It is working.
Crawler v1
<?php
ini_set('display_errors', 1);
ini_set('display_startup_errors', 1);
error_reporting(E_ALL);
// Preparing Crawler & Session: Initializing Variables.
// Preparing $ARRAYS For Step 1: To Deal with Xml Links meant for Crawlers only.
// SiteMaps Details Scraped from SiteMaps or Xml Files.
$sitemaps = []; // This will list extracted further Xml SiteMap links (.xml) found on Sitemaps (.xml).
$sitemaps_last_mods = []; // This will list dates of SiteMap pages last modified - found on Sitemaps.
$sitemaps_change_freqs = []; // This will list SiteMap pages' change frequencies - found on Sitemaps.
$sitemaps_priorities = []; // This will list SiteMap pages priorities - found on Sitemaps.
// Webpage Details Scraped from SiteMaps or Xml Files.
$html_page_urls = []; // This will list extracted html links Urls (.html, .htm, .php) - found on Sitemaps (.xml).
$html_page_last_mods = []; // This will list dates of html pages last modified - found on Sitemap.
$html_page_change_freqs = []; // This will list html pages' change frequencies - found on Sitemaps.
$html_page_priorities = []; // This will list html pages priorities - found on Sitemaps.
// Step 1: Initiate Session - Feed Xml SiteMap URL. Crawling Starting Point.
$initial_url = "http://localhost/Work/buzz/Templates/0.xml";
$xml = simplexml_load_file($initial_url);
$dom = new DOMDocument();
$dom->loadXML($xml->asXML());
echo __LINE__ . '<br>';
crawl_sitemaps($xml);
foreach ($html_page_urls as $html_page_url) {
echo __LINE__ . '<br>';
scrape_page_data($html_page_url); // Extract Meta Data and Title from HTML page.
}
// END OF SCRIPT FLOW.
// FUNCTIONS BEYOND THIS POINT.
// Crawl SiteMaps.
function crawl_sitemaps($xml)
{
global $sitemaps;
global $html_page_urls;
if ($xml->getName() === 'sitemapindex') {
foreach ($xml->sitemap as $urlElement) {
$sitemaps[] = $sitemap_url = (string)$urlElement->loc;
$sitemaps_last_mods[] = $last_mod = (string)$urlElement->lastmod;
$sitemaps_change_freqs[] = $change_freq = (string)$urlElement->changefreq;
$sitemaps_priorities[] = $priority = (string)$urlElement->priority;
echo 'sitemap_url: ' . $sitemap_url . '<br>';
echo 'last_mod: ' . $last_mod . '<br>';
echo 'change_freq: ' . $change_freq . '<br>';
echo 'priority: ' . $priority . '<br>';
echo '<br>---<br>';
$sitemap_xml = simplexml_load_file($sitemap_url);
crawl_sitemaps($sitemap_xml); // Recursively crawl nested sitemaps.
}
} elseif ($xml->getName() === 'urlset') {
foreach ($xml->url as $urlElement) {
$html_page_urls[] = $html_page_url = (string)$urlElement->loc;
$html_page_last_mods[] = $last_mod = (string)$urlElement->lastmod;
$html_page_change_freqs[] = $change_freq = (string)$urlElement->changefreq;
$html_page_priorities[] = $priority = (string)$urlElement->priority;
echo 'html_page_url: ' . $html_page_url . '<br>';
echo 'last_mod: ' . $last_mod . '<br>';
echo 'change_freq: ' . $change_freq . '<br>';
echo 'priority: ' . $priority . '<br>';
echo '<br>---<br>';
}
}
echo 'SiteMaps Crawled: ---<br><br>';
print_r($sitemaps);
echo '<br><br>';
echo 'HTML Pages Crawled: ---<br><br>';
print_r($html_page_urls);
echo '<br><br>';
}
// Meta Data & Title Extractor.
function scrape_page_data($html_page_url)
{
$html = file_get_contents($html_page_url);
$doc = new DOMDocument();
libxml_use_internal_errors(true);
$doc->loadHTML($html, LIBXML_COMPACT | LIBXML_NOERROR | LIBXML_NOWARNING);
libxml_clear_errors();
$meta_tags = $doc->getElementsByTagName('meta');
if ($meta_tags->length > 0) {
foreach ($meta_tags as $tag) {
echo 'Meta Name: ' . $meta_name = $tag->getAttribute('name') . '<br>';
echo 'Meta Content: ' . $meta_content = $tag->getAttribute('content') . '<br>';
}
}
$title_tag = $doc->getElementsByTagName('title');
if ($title_tag->length > 0) {
echo 'Title: ' . $title = $title_tag[0]->textContent . '<br>';
}
}
?>
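Two things I am still wondering about in the fixed version, in case they matter:

1) crawl_sitemaps() calls simplexml_load_file() on every nested sitemap URL and recurses straight away, without checking whether the load actually worked. My idea (just a sketch, not tested; the load_sitemap_safely() name is my own) was a small helper so one bad sitemap URL does not stop the whole crawl:

<?php
// Sketch only: my own guess at safer sitemap loading, not part of ChatGPT's fix.
// simplexml_load_file() returns false on failure, so check before recursing.
function load_sitemap_safely($sitemap_url)
{
    $sitemap_xml = @simplexml_load_file($sitemap_url);
    if ($sitemap_xml === false) {
        echo 'Failed to load sitemap: ' . $sitemap_url . '<br>';
        return null;
    }
    return $sitemap_xml;
}

// Inside crawl_sitemaps(), the recursion would then look something like:
// $sitemap_xml = load_sitemap_safely($sitemap_url);
// if ($sitemap_xml !== null) {
//     crawl_sitemaps($sitemap_xml);
// }
?>

2) The new scrape_page_data() only echoes the meta names, meta contents and title, but no longer stores them into $html_page_meta_names, $html_page_meta_descriptions and $html_page_titles the way my original script tried to. If I want to keep them for later use, I guess something like this would do it (again just a sketch; scrape_page_data_and_store() is my own name for it):

<?php
// Sketch only: store the scraped values back into the global arrays my original script used.
$html_page_meta_names = [];
$html_page_meta_descriptions = [];
$html_page_titles = [];

function scrape_page_data_and_store($html_page_url)
{
    global $html_page_meta_names, $html_page_meta_descriptions, $html_page_titles;
    $html = @file_get_contents($html_page_url);
    if ($html === false) {
        return; // Skip pages that could not be fetched.
    }
    $doc = new DOMDocument();
    libxml_use_internal_errors(true);
    $doc->loadHTML($html, LIBXML_COMPACT | LIBXML_NOERROR | LIBXML_NOWARNING);
    libxml_clear_errors();
    foreach ($doc->getElementsByTagName('meta') as $tag) {
        $html_page_meta_names[] = $tag->getAttribute('name');
        $html_page_meta_descriptions[] = $tag->getAttribute('content');
    }
    $title_tag = $doc->getElementsByTagName('title');
    if ($title_tag->length > 0) {
        $html_page_titles[] = $title_tag->item(0)->textContent;
    }
}
?>

Would either of those be worth adding, or is the simpler version fine as it is?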