@reverend_jim
How many programming langs you know ?
@reverend_jim
How many programming langs you know ?
@rprofitt
I get no urls crawled or extracted ? Get no errors, either. Strange! I just get this echoed:
**SiteMaps Crawled: ---
Array ( )
Html Pages Crawled: ---
Array ( )
Array ( )
Array ( )
Array ( )
**
As you can see, the starting point link does have urls on it's pages:
<?php
ini_set('display_errors',1);
ini_set('display_startup_errors',1);
error_reporting(E_ALL);
//Preparing Crawler & Session: Initialising Variables.
//Preparing $ARRAYS For Step 1: To Deal with Xml Links meant for Crawlers only.
//Data Scraped from SiteMaps or Xml Files.
$sitemaps = []; //This will list extracted further Xml SiteMap links (.xml) found on Sitemaps (.xml).
$sitemaps_last_mods = []; //This will list dates of SiteMap pages last modified - found on Sitemap.
$sitemaps_change_freqs = []; //his will list SiteMap dates of html pages frequencies of page updates - found on Sitemaps.
$sitemaps_priorities = []; //This will list SiteMap pages priorities - found on Sitemaps.
//Data Scraped from SiteMaps or Xml Files.
$html_page_urls = array(); //This will list extracted html links Urls (.html, .htm, .php) - found on Sitemaps (.xml).
$html_page_last_mods = []; //This will list dates of html pages last modified - found on Sitemap.
$html_page_change_freqs = []; //his will list dates of html pages frequencies of page updates - found on Sitemaps.
$html_page_priorities = []; //This will list html pages priorities - found on Sitemaps.
//Preparing $ARRAYS For Step 2: To Deal with html pages meant for Human Visitors only.
//Data Scraped from Html Files. Not Xml SiteMap Files.
$html_page_titles = []; //This will list crawled pages Titles - found on html pages.
$html_page_meta_names = []; //This will list crawled pages Meta Tag Names - found on html pages.
$html_page_meta_descriptions = []; //This will list crawled pages Meta Tag Descriptions - found on html pages.
// -----
//Step 1: Initiate Session - Feed Xml SiteMap Url. Crawing Starting Point.
//Crawl Session Starting Page/Initial Xml Sitemap.
$sitemap = "https://www.rocktherankings.com/sitemap_index.xml"; //Has more xml files.
$xml = file_get_contents($sitemap); //Should I stick to this line or below line ?
// parse the sitemap content to object
//$xml = simplexml_load_string($sitemap); //Should I stick to this line or above line ?
$dom = new DOMDocument();
$dom->loadXML($xml);
//Trigger following IF/ELSEs on each Crawled Page to check for link types. Whether Links lead to more SiteMaps (.xml) or webpages (.html, .htm, .php, etc.).
if ($dom->nodeName === 'sitemapindex') //Current Xml SiteMap Page lists more Xml SiteMaps. Lists links to Xml links. Not lists links to html links.
{
//parse the index
// retrieve properties from the sitemap object
foreach ($xml->urlset as $urlElement) //Extracts html file urls.
{
// get properties
$sitemaps[] = $sitemap_url = $urlElement->loc;
$sitemaps_last_mods[] = $last_mod = $urlElement->lastmod;
$sitemaps_change_freqs[] = $change_freq = $urlElement->changefreq;
$sitemaps_priorities[] = $priority = $urlElement->priority;
// print out the properties
echo 'url: '. $sitemap_url . '<br>';
echo 'lastmod: '. $last_mod . '<br>';
echo 'changefreq: '. $change_freq . '<br>';
echo 'priority: '. $priority . '<br>';
echo '<br>---<br>';
}
}
else if ($dom->nodeName === 'urlset') //Current Xml SiteMap Page lists no more Xml SiteMap links. Lists only html links.
{
//parse url set
// retrieve properties from the sitemap object
foreach ($xml->sitemapindex as $urlElement) //Extracts Sitemap Urls.
{
// get properties
$html_page_urls[] = $html_page_url = $urlElement->loc;
$html_page_last_mods[] = $last_mod = $urlElement->lastmod;
$html_page_change_freqs[] = $change_freq = $urlElement->changefreq;
$html_page_priorities[] = $priority = $urlElement->priority;
// print out the properties
echo 'url: '. $html_page_url . '<br>';
echo 'lastmod: '. $last_mod . '<br>';
echo 'changefreq: '. $change_freq . '<br>';
echo 'priority: '. $priority . '<br>';
echo '<br>---<br>';
}
}
else
{
//Scrape Webpage Data as current page is an hmtl page for visitors and no Xml SiteMap page for Crawlers.
//scrape_page_data(); //Scrape Page Title & Meta Tags.
}
echo 'SiteMaps Crawled: ---';echo '<br><br>';
if(array_count_values($html_page_urls)>0)
{
print_r($sitemaps);
echo '<br>';
}
elseif(array_count_values($sitemaps_last_mods)>0)
{
print_r($sitemaps_last_mods);
echo '<br>';
}
elseif(array_count_values($sitemaps_change_freqs)>0)
{
print_r($sitemaps_change_freqs);
echo '<br>';
}
elseif(array_count_values($sitemaps_priorities)>0)
{
print_r($sitemaps_priorities);
echo '<br><br>';
}
echo 'Html Pages Crawled: ---'; echo '<br><br>';
if(array_count_values($html_page_urls)>0)
{
print_r($html_page_urls);
echo '<br>';
}
if(array_count_values($html_page_last_mods)>0)
{
print_r($html_page_last_mods);
echo '<br>';
}
if(array_count_values($html_page_change_freqs)>0)
{
print_r($html_page_change_freqs);
echo '<br>';
}
if(array_count_values($html_page_priorities)>0)
{
print_r($html_page_priorities);
echo '<br>';
}
scrape_page_data(); //Scrape Page Title & Meta Tags.
function scrape_page_data()
{
GLOBAL $html_page_urls;
if(array_count_values($html_page_urls)>0)
{
foreach($html_page_urls AS $url)
{
//Extract Page's Meta Data & Title.
file_get_contents($url);
// https://www.php.net/manual/en/function.file-get-contents
$html = file_get_contents($url);
//https://www.php.net/manual/en/domdocument.construct.php
$doc = new DOMDocument();
// https://www.php.net/manual/en/function.libxml-use-internal-errors.php
libxml_use_internal_errors(true);
// https://www.php.net/manual/en/domdocument.loadhtml.php
$doc->loadHTML($html, LIBXML_COMPACT|LIBXML_NOERROR|LIBXML_NOWARNING);
// https://www.php.net/manual/en/function.libxml-clear-errors.php
libxml_clear_errors();
// https://www.php.net/manual/en/domdocument.getelementsbytagname.php
$meta_tags = $doc->getElementsByTagName('meta');
// https://www.php.net/manual/en/domnodelist.item.php
if ($meta_tags->length > 0)
{
// https://www.php.net/manual/en/class.domnodelist.php
foreach ($meta_tags as $tag)
{
// https://www.php.net/manual/en/domnodelist.item.php
echo 'Name: ' .$name = $tag->getAttribute('name'); echo '<br>';
echo 'Content: ' .$content = $tag->getAttribute('content'); echo '<br>';
}
}
//EXAMPLE 1: Extract Title
$title_tag = $doc->getElementsByTagName('title');
if ($title_tag->length>0)
{
echo 'Title: ' .$title = $title_tag[0]->textContent; echo '<br>';
}
//EXAMPLE 2: Extract Title
$title_tag = $doc->getElementsByTagName('title');
for ($i = 0; $i < $title_tag->length; $i++) {
echo $title_tag->item($i)->nodeValue . "\n";
}
}
}
}
?>
That is my latest update. What do you think about it and why you think I getting echoed no links ?
@dani
Can the below code get any shorter or not so I can easily spot where the issue is as I get no error and no proper result.
Just get echoed:
**SiteMaps Crawled: ---
Array ( )
Html Pages Crawled: ---
Array ( )
Array ( )
Array ( )
Array ( ) **
<?php
ini_set('display_errors',1);
ini_set('display_startup_errors',1);
error_reporting(E_ALL);
//Preparing Crawler & Session: Initialising Variables.
//Preparing $ARRAYS For Step 1: To Deal with Xml Links meant for Crawlers only.
//Data Scraped from SiteMaps or Xml Files.
$sitemaps = []; //This will list extracted further Xml SiteMap links (.xml) found on Sitemaps (.xml).
$sitemaps_last_mods = []; //This will list dates of SiteMap pages last modified - found on Sitemap.
$sitemaps_change_freqs = []; //his will list SiteMap dates of html pages frequencies of page updates - found on Sitemaps.
$sitemaps_priorities = []; //This will list SiteMap pages priorities - found on Sitemaps.
//Data Scraped from SiteMaps or Xml Files.
$html_page_urls = array(); //This will list extracted html links Urls (.html, .htm, .php) - found on Sitemaps (.xml).
$html_page_last_mods = []; //This will list dates of html pages last modified - found on Sitemap.
$html_page_change_freqs = []; //his will list dates of html pages frequencies of page updates - found on Sitemaps.
$html_page_priorities = []; //This will list html pages priorities - found on Sitemaps.
//Preparing $ARRAYS For Step 2: To Deal with html pages meant for Human Visitors only.
//Data Scraped from Html Files. Not Xml SiteMap Files.
$html_page_titles = []; //This will list crawled pages Titles - found on html pages.
$html_page_meta_names = []; //This will list crawled pages Meta Tag Names - found on html pages.
$html_page_meta_descriptions = []; //This will list crawled pages Meta Tag Descriptions - found on html pages.
// -----
//Step 1: Initiate Session - Feed Xml SiteMap Url. Crawing Starting Point.
//Crawl Session Starting Page/Initial Xml Sitemap.
$sitemap = "https://www.rocktherankings.com/sitemap_index.xml"; //Has more xml files.
$xml = file_get_contents($sitemap); //Should I stick to this line or below line ?
// parse the sitemap content to object
//$xml = simplexml_load_string($sitemap); //Should I stick to this line or above line ?
$dom = new DOMDocument();
$dom->loadXML($xml);
extract_links();
function extract_links()
{
GLOBAL $dom;
//Trigger following IF/ELSEs on each Crawled Page to check for link types. Whether Links lead to more SiteMaps (.xml) or webpages (.html, .htm, .php, etc.).
if ($dom->nodeName === 'sitemapindex') //Current Xml SiteMap Page lists more Xml SiteMaps. Lists links to Xml links. Not lists links to html links.
{
//parse the index
// retrieve properties from the sitemap object
foreach ($xml->urlset as $urlElement) //Extracts html file urls.
{
// get properties
$sitemaps[] = $sitemap_url = $urlElement->loc;
$sitemaps_last_mods[] = $last_mod = $urlElement->lastmod;
$sitemaps_change_freqs[] = $change_freq = $urlElement->changefreq;
$sitemaps_priorities[] = $priority = $urlElement->priority;
// print out the properties
echo 'url: '. $sitemap_url . '<br>';
echo 'lastmod: '. $last_mod . '<br>';
echo 'changefreq: '. $change_freq . '<br>';
echo 'priority: '. $priority . '<br>';
echo '<br>---<br>';
}
}
else if ($dom->nodeName === 'urlset') //Current Xml SiteMap Page lists no more Xml SiteMap links. Lists only html links.
{
//parse url set
// retrieve properties from the sitemap object
foreach ($xml->sitemapindex as $urlElement) //Extracts Sitemap Urls.
{
// get properties
$html_page_urls[] = $html_page_url = $urlElement->loc;
$html_page_last_mods[] = $last_mod = $urlElement->lastmod;
$html_page_change_freqs[] = $change_freq = $urlElement->changefreq;
$html_page_priorities[] = $priority = $urlElement->priority;
// print out the properties
echo 'url: '. $html_page_url . '<br>';
echo 'lastmod: '. $last_mod . '<br>';
echo 'changefreq: '. $change_freq . '<br>';
echo 'priority: '. $priority . '<br>';
echo '<br>---<br>';
}
}
else
{
//Scrape Webpage Data as current page is an hmtl page for visitors and no Xml SiteMap page for Crawlers.
//scrape_page_data(); //Scrape Page Title & Meta Tags.
}
GLOBAL $sitemaps;
GLOBAL $sitemaps_last_mods;
GLOBAL $sitemaps_change_freqs;
GLOBAL $sitemaps_priorities;
GLOBAL $html_page_urls;
GLOBAL $html_page_last_mods;
GLOBAL $html_page_change_freqs;
GLOBAL $html_page_priorities;
echo 'SiteMaps Crawled: ---'; echo '<br><br>';
if(array_count_values($sitemaps)>0)
{
print_r($sitemaps);
echo '<br>';
}
elseif(array_count_values($sitemaps_last_mods)>0)
{
print_r($sitemaps_last_mods);
echo '<br>';
}
elseif(array_count_values($sitemaps_change_freqs)>0)
{
print_r($sitemaps_change_freqs);
echo '<br>';
}
elseif(array_count_values($sitemaps_priorities)>0)
{
print_r($sitemaps_priorities);
echo '<br><br>';
}
echo 'Html Pages Crawled: ---'; echo '<br><br>';
if(array_count_values($html_page_urls)>0)
{
print_r($html_page_urls);
echo '<br>';
}
if(array_count_values($html_page_last_mods)>0)
{
print_r($html_page_last_mods);
echo '<br>';
}
if(array_count_values($html_page_change_freqs)>0)
{
print_r($html_page_change_freqs);
echo '<br>';
}
if(array_count_values($html_page_priorities)>0)
{
print_r($html_page_priorities);
echo '<br>';
}
}
foreach($sitemaps AS $sitemap)
{
extract_links();
}
foreach($html_page_urls AS $html_page_url)
{
extract_links();
}
scrape_page_data(); //Scrape Page Title & Meta Tags.
function scrape_page_data()
{
GLOBAL $html_page_urls;
if(array_count_values($html_page_urls)>0)
{
foreach($html_page_urls AS $url)
{
//Extract Page's Meta Data & Title.
file_get_contents($url);
// https://www.php.net/manual/en/function.file-get-contents
$html = file_get_contents($url);
//https://www.php.net/manual/en/domdocument.construct.php
$doc = new DOMDocument();
// https://www.php.net/manual/en/function.libxml-use-internal-errors.php
libxml_use_internal_errors(true);
// https://www.php.net/manual/en/domdocument.loadhtml.php
$doc->loadHTML($html, LIBXML_COMPACT|LIBXML_NOERROR|LIBXML_NOWARNING);
// https://www.php.net/manual/en/function.libxml-clear-errors.php
libxml_clear_errors();
// https://www.php.net/manual/en/domdocument.getelementsbytagname.php
$meta_tags = $doc->getElementsByTagName('meta');
// https://www.php.net/manual/en/domnodelist.item.php
if ($meta_tags->length > 0)
{
// https://www.php.net/manual/en/class.domnodelist.php
foreach ($meta_tags as $tag)
{
// https://www.php.net/manual/en/domnodelist.item.php
echo 'Name: ' .$name = $tag->getAttribute('name'); echo '<br>';
echo 'Content: ' .$content = $tag->getAttribute('content'); echo '<br>';
}
}
//EXAMPLE 1: Extract Title
$title_tag = $doc->getElementsByTagName('title');
if ($title_tag->length>0)
{
echo 'Title: ' .$title = $title_tag[0]->textContent; echo '<br>';
}
//EXAMPLE 2: Extract Title
$title_tag = $doc->getElementsByTagName('title');
for ($i = 0; $i < $title_tag->length; $i++) {
echo $title_tag->item($i)->nodeValue . "\n";
}
}
}
}
?>
Want to see how much you are able to cut it short.
Scratching my head why no link and their meta tags & titles are getting extracted
Thanks
Code should not be measured by how many lines it is or how short it is. Instead, it should be measured by how efficient the code is, how performant it is, how readable it is, and how compartmentalized it is (translating to how easy it is to maintain).
@dani
In that case I did my best to neaten it up. What you think ? Can you argue otherwise ?
I am really really puzzled to why it's not scraping any links.
@rprofitt
Since you are an expert in securities. I guess preventing malicioius injections is your thing too. Am I right ?
Is my crawler code safe or not upto your standard yet ?
I got a worry. Let us see if you can settle it or not.
Crawlers cannot be trapped by hackers on their sites, can they ? I mean, let us say a crook called my crawler to one of his malicious or phishing sites, is he able to trap my crawler and inject virus so my crawler dumps viruses and malicious code onto my searchengine index by the crawler ? Or, worst, can my crawler carry the virus on other sites it crawls and infect them ? Good question. Yes ?
What you think of my code above ? Is it orthodox or weird ? I cannot think of any better basic logics than the ones I used. What you say so far ? It can be improved in terms of security or even efficiency ? I'd like to see how.
@pritaeas
Nearly 2am here and I still cannot figure-out why my crawler fails to extract links, meta data & page titles!
Here is the latest code. Do you see any flaws ? I get no errors.
I only get this echoed.Notice the arrays are empty. It means no data is getting extracted from pages.
**334
361
SiteMaps Crawled: ---
Array ( )
Html Pages Crawled: ---
Array ( )
Array ( )
Array ( )
Array ( )
338
Array ( )
Array ( )
Array ( )**
<?php
ini_set('display_errors',1);
ini_set('display_startup_errors',1);
error_reporting(E_ALL);
//START OF SCRIPT FLOW.
//START OF SCRIPT FLOW.
//Preparing Crawler & Session: Initialising Variables.
//Preparing $ARRAYS For Step 1: To Deal with Xml Links meant for Crawlers only.
//Data Scraped from SiteMaps or Xml Files.
$sitemaps = []; //This will list extracted further Xml SiteMap links (.xml) found on Sitemaps (.xml).
$sitemaps_last_mods = []; //This will list dates of SiteMap pages last modified - found on Sitemaps.
$sitemaps_change_freqs = []; //his will list SiteMap dates of html pages frequencies of page updates - found on Sitemaps.
$sitemaps_priorities = []; //This will list SiteMap pages priorities - found on Sitemaps.
//Data Scraped from SiteMaps or Xml Files.
$html_page_urls = []; //This will list extracted html links Urls (.html, .htm, .php) - found on Sitemaps (.xml).
$html_page_last_mods = []; //This will list dates of html pages last modified - found on Sitemap.
$html_page_change_freqs = []; //his will list dates of html pages frequencies of page updates - found on Sitemaps.
$html_page_priorities = []; //This will list html pages priorities - found on Sitemaps.
//Preparing $ARRAYS For Step 2: To Deal with html pages meant for Human Visitors only.
//Data Scraped from Html Files. Not Xml SiteMap Files.
$html_page_meta_names = []; //This will list crawled pages Meta Tag Names - found on html pages.
$html_page_meta_descriptions = []; //This will list crawled pages Meta Tag Descriptions - found on html pages.
$html_page_titles = []; //This will list crawled pages Titles - found on html pages.
// -----
//Step 1: Initiate Session - Feed Xml SiteMap Url. Crawing Starting Point.
//Crawl Session Starting Page/Initial Xml Sitemap.
$initial_url = "https://www.rocktherankings.com/sitemap_index.xml"; //Has more xml files.
$xml = file_get_contents($initial_url); //Should I stick to this line or below line ?
//Parse the sitemap content to object
//$xml = simplexml_load_string($initial_url); //Should I stick to this line or above line ?
$dom = new DOMDocument();
$dom->loadXML($xml);
echo __LINE__; echo '<br>'; //LINE: 334
extract_links($xml);
echo __LINE__; echo '<br>'; //LINE: 338
foreach($sitemaps AS $sitemap)
{
echo __LINE__; echo '<br>';
extract_links($sitemap); //Extract Links on page.
}
foreach($html_page_urls AS $html_page_url)
{
echo __LINE__; echo '<br>';
extract_links($html_page_url); //Extract Links on page.
}
scrape_page_data(); //Scrape Page Title & Meta Tags.
//END OF SCRIPT FLOW.
//DUNCTIONS BEYOND THIS POINT.
//Links Extractor.
function extract_links()
{
echo __LINE__; echo '<br>'; //LINE: 361
GLOBAL $dom;
//Trigger following IF/ELSEs on each Crawled Page to check for link types. Whether Links lead to more SiteMaps (.xml) or webpages (.html, .htm, .php, etc.).
if ($dom->nodeName === 'sitemapindex') //Current Xml SiteMap Page lists more Xml SiteMaps. Lists links to Xml links. Not lists links to html links.
{
echo __LINE__; echo '<br>';
//parse the index
// retrieve properties from the sitemap object
foreach ($xml->urlset as $urlElement) //Extracts html file urls.
{
// get properties
$sitemaps[] = $sitemap_url = $urlElement->loc;
$sitemaps_last_mods[] = $last_mod = $urlElement->lastmod;
$sitemaps_change_freqs[] = $change_freq = $urlElement->changefreq;
$sitemaps_priorities[] = $priority = $urlElement->priority;
// print out the properties
echo 'url: '. $sitemap_url . '<br>';
echo 'lastmod: '. $last_mod . '<br>';
echo 'changefreq: '. $change_freq . '<br>';
echo 'priority: '. $priority . '<br>';
echo '<br>---<br>';
}
}
else if ($dom->nodeName === 'urlset') //Current Xml SiteMap Page lists no more Xml SiteMap links. Lists only html links.
{
echo __LINE__; echo '<br>';
//parse url set
// retrieve properties from the sitemap object
foreach ($xml->sitemapindex as $urlElement) //Extracts Sitemap Urls.
{
// get properties
$html_page_urls[] = $html_page_url = $urlElement->loc;
$html_page_last_mods[] = $last_mod = $urlElement->lastmod;
$html_page_change_freqs[] = $change_freq = $urlElement->changefreq;
$html_page_priorities[] = $priority = $urlElement->priority;
// print out the properties
echo 'url: '. $html_page_url . '<br>';
echo 'lastmod: '. $last_mod . '<br>';
echo 'changefreq: '. $change_freq . '<br>';
echo 'priority: '. $priority . '<br>';
echo '<br>---<br>';
}
}
GLOBAL $sitemaps;
GLOBAL $sitemaps_last_mods;
GLOBAL $sitemaps_change_freqs;
GLOBAL $sitemaps_priorities;
GLOBAL $html_page_urls;
GLOBAL $html_page_last_mods;
GLOBAL $html_page_change_freqs;
GLOBAL $html_page_priorities;
echo 'SiteMaps Crawled: ---'; echo '<br><br>';
if(array_count_values($sitemaps)>0)
{
print_r($sitemaps);
echo '<br>';
}
elseif(array_count_values($sitemaps_last_mods)>0)
{
print_r($sitemaps_last_mods);
echo '<br>';
}
elseif(array_count_values($sitemaps_change_freqs)>0)
{
print_r($sitemaps_change_freqs);
echo '<br>';
}
elseif(array_count_values($sitemaps_priorities)>0)
{
print_r($sitemaps_priorities);
echo '<br><br>';
}
echo 'Html Pages Crawled: ---'; echo '<br><br>';
if(array_count_values($html_page_urls)>0)
{
print_r($html_page_urls);
echo '<br>';
}
if(array_count_values($html_page_last_mods)>0)
{
print_r($html_page_last_mods);
echo '<br>';
}
if(array_count_values($html_page_change_freqs)>0)
{
print_r($html_page_change_freqs);
echo '<br>';
}
if(array_count_values($html_page_priorities)>0)
{
print_r($html_page_priorities);
echo '<br>';
}
}
//Meta Data & Title Extractor.
function scrape_page_data()
{
GLOBAL $html_page_urls;
if(array_count_values($html_page_urls)>0)
{
foreach($html_page_urls AS $url)
{
// https://www.php.net/manual/en/function.file-get-contents
$html = file_get_contents($url);
//https://www.php.net/manual/en/domdocument.construct.php
$doc = new DOMDocument();
// https://www.php.net/manual/en/function.libxml-use-internal-errors.php
libxml_use_internal_errors(true);
// https://www.php.net/manual/en/domdocument.loadhtml.php
$doc->loadHTML($html, LIBXML_COMPACT|LIBXML_NOERROR|LIBXML_NOWARNING);
// https://www.php.net/manual/en/function.libxml-clear-errors.php
libxml_clear_errors();
// https://www.php.net/manual/en/domdocument.getelementsbytagname.php
$meta_tags = $doc->getElementsByTagName('meta');
// https://www.php.net/manual/en/domnodelist.item.php
if ($meta_tags->length > 0)
{
// https://www.php.net/manual/en/class.domnodelist.php
foreach ($meta_tags as $tag)
{
// https://www.php.net/manual/en/domnodelist.item.php
echo 'Meta Name: ' .$meta_name = $tag->getAttribute('name'); echo '<br>';
echo 'Meta Content: ' .$meta_content = $tag->getAttribute('content'); echo '<br>';
$html_page_meta_names[] = $meta_name;
$html_page_meta_descriptions[] = $meta_content;
}
}
//EXAMPLE 1: Extract Title
$title_tag = $doc->getElementsByTagName('title');
if ($title_tag->length>0)
{
echo 'Title: ' .$title = $title_tag[0]->textContent; echo '<br>';
$html_page_titles[] = $title;
}
//EXAMPLE 2: Extract Title
$title_tag = $doc->getElementsByTagName('title');
for ($i = 0; $i < $title_tag->length; $i++) {
echo 'Title: ' .$title = $title_tag->item($i)->nodeValue . "\n";
$html_page_titles[] = $title;
}
}
}
}
if(array_count_values($html_page_meta_names)>0)
{
print_r($html_page_meta_names);
echo '<br>';
}
if(array_count_values($html_page_meta_descriptions)>0)
{
print_r($html_page_meta_descriptions);
echo '<br>';
}
if(array_count_values($html_page_titles)>0)
{
print_r($html_page_titles);
echo '<br>';
}
//END OF FUNCTIONS.
?>
Drat! I give-up for the night!
@rprofitt
What is wrong with my above code ? Why is it failing to extract the page titles ?
@reverend_jim
Why my code failing to extract the meta datas ?
@dani
I notice, above I got things wrong way round. Notice the foreach on both ...
if ($dom->nodeName === 'sitemapindex') //Current Xml SiteMap Page lists more Xml SiteMaps. Lists links to Xml links. Not lists links to html links.
{
echo __LINE__; echo '<br>';
//parse the index
// retrieve properties from the sitemap object
foreach ($xml->sitemapindex as $urlElement) //Extracts html file urls.
{
// get properties
$sitemaps[] = $sitemap_url = $urlElement->loc;
$sitemaps_last_mods[] = $last_mod = $urlElement->lastmod;
$sitemaps_change_freqs[] = $change_freq = $urlElement->changefreq;
$sitemaps_priorities[] = $priority = $urlElement->priority;
// print out the properties
echo 'url: '. $sitemap_url . '<br>';
echo 'lastmod: '. $last_mod . '<br>';
echo 'changefreq: '. $change_freq . '<br>';
echo 'priority: '. $priority . '<br>';
echo '<br>---<br>';
}
}
else if ($dom->nodeName === 'urlset') //Current Xml SiteMap Page lists no more Xml SiteMap links. Lists only html links.
{
echo __LINE__; echo '<br>';
//parse url set
// retrieve properties from the sitemap object
foreach ($xml->urlset as $urlElement) //Extracts Sitemap Urls.
{
// get properties
$html_page_urls[] = $html_page_url = $urlElement->loc;
$html_page_last_mods[] = $last_mod = $urlElement->lastmod;
$html_page_change_freqs[] = $change_freq = $urlElement->changefreq;
$html_page_priorities[] = $priority = $urlElement->priority;
// print out the properties
echo 'url: '. $html_page_url . '<br>';
echo 'lastmod: '. $last_mod . '<br>';
echo 'changefreq: '. $change_freq . '<br>';
echo 'priority: '. $priority . '<br>';
echo '<br>---<br>';
}
}
Now fixed it but issue still remains:
if ($dom->nodeName === 'sitemapindex') //Current Xml SiteMap Page lists more Xml SiteMaps. Lists links to Xml links. Not lists links to html links.
{
echo __LINE__; echo '<br>';
//parse the index
// retrieve properties from the sitemap object
foreach ($xml->urlset as $urlElement) //Extracts html file urls.
{
// get properties
$sitemaps[] = $sitemap_url = $urlElement->loc;
$sitemaps_last_mods[] = $last_mod = $urlElement->lastmod;
$sitemaps_change_freqs[] = $change_freq = $urlElement->changefreq;
$sitemaps_priorities[] = $priority = $urlElement->priority;
// print out the properties
echo 'url: '. $sitemap_url . '<br>';
echo 'lastmod: '. $last_mod . '<br>';
echo 'changefreq: '. $change_freq . '<br>';
echo 'priority: '. $priority . '<br>';
echo '<br>---<br>';
}
}
else if ($dom->nodeName === 'urlset') //Current Xml SiteMap Page lists no more Xml SiteMap links. Lists only html links.
{
echo __LINE__; echo '<br>';
//parse url set
// retrieve properties from the sitemap object
foreach ($xml->sitemapindex as $urlElement) //Extracts Sitemap Urls.
{
// get properties
$html_page_urls[] = $html_page_url = $urlElement->loc;
$html_page_last_mods[] = $last_mod = $urlElement->lastmod;
$html_page_change_freqs[] = $change_freq = $urlElement->changefreq;
$html_page_priorities[] = $priority = $urlElement->priority;
// print out the properties
echo 'url: '. $html_page_url . '<br>';
echo 'lastmod: '. $last_mod . '<br>';
echo 'changefreq: '. $change_freq . '<br>';
echo 'priority: '. $priority . '<br>';
echo '<br>---<br>';
}
}
**
( ! ) Warning: DOMDocument::loadXML(): Start tag expected, '<' not found in Entity, line: 6 in C:\wamp64\www\Work\buzz\Templates\crawler_Test.php on line 336
Call Stack
1 0.0012 362584 {main}( ) ...\crawler_Test.php:0
2 2.4234 365600 loadXML( $source = class SimpleXMLElement { public $sitemap = [0 => class SimpleXMLElement { ... }, 1 => class SimpleXMLElement { ... }, 2 => class SimpleXMLElement { ... }, 3 => class SimpleXMLElement { ... }] } ) ...\crawler_Test.php:336
338
365
SiteMaps Crawled: ---
Array ( )
Html Pages Crawled: ---
Array ( )
Array ( )
Array ( )
Array ( )
342
Array ( )
Array ( )
Array ( ) Bold Text Here**
We're a friendly, industry-focused community of developers, IT pros, digital marketers, and technology enthusiasts meeting, networking, learning, and sharing knowledge.