I have code written to scrape one domain, and I want to modify it to scrape another domain (http://www.apbspeakers.com/speakers), but I don't know which CSS elements of the new domain to start from. I think it would be something like: __doPostBack('ctl00$pid-speakers');
and var nodes = __utils__.getElementsByXPath("//a[ 'pid-speakers']");
???
Here is the code:
var utils = require('utils');
var casper = require('casper').create();
// Open the speaker directory of the site this script was written for.
casper.start("http://www.wmespeakers.com/Speakers.aspx");
casper.userAgent('Mozilla/5.0 (Windows NT 5.1; rv:16.0) Gecko/20100101 Firefox/16.0');

// Ask the page for its 'Alphabetical' view.
// NOTE(review): __doPostBack is presumably the page's own ASP.NET postback
// helper; it is not defined anywhere in this script.
casper.thenEvaluate(function()
{
    __doPostBack('ctl00$ContentPlaceHolder1$Menu1', 'Alphabetical');
});

// hrefs of the speaker links found on the listing page.
var urls = [];
// Index into urls of the next speaker to scrape; set to 0 once urls is filled.
var counter;

casper.then(function()
{
    var found = this.evaluate(function()
    {
        var nodes = __utils__.getElementsByXPath("//a[starts-with(@id, 'ContentPlaceHolder1_SpeakersAlphabetically1_lstSpeakers_DataList1')]");
        return Array.prototype.map.call(nodes, function(e)
        {
            return e.getAttribute("href");
        });
    });
    // evaluate() may come back null/undefined (e.g. if the in-page code
    // fails); keep urls an array so the urls.length reads below cannot throw.
    urls = found || [];
});

casper.then(function()
{
    casper.echo(urls.length + ' links found:');
    counter = 0;
});
/**
 * Step the per-speaker loop forward by one link.
 *
 * Reads the file-level `urls` and `counter`. While links remain, it queues the
 * next speaker page through parseLink, advances `counter`, and calls
 * this.run(check) so it is invoked again; otherwise it reports completion
 * and exits.
 *
 * Expects `this` to provide run(), echo() and exit().
 */
function check()
{
    var finished = !(counter < urls.length);
    if (finished)
    {
        this.echo("All done.");
        this.exit();
        return;
    }

    // hrefs are appended to the site root as-is.
    var target = 'http://www.wmespeakers.com/' + urls[counter];
    parseLink.call(this, target);
    counter++;
    this.run(check);
}
// Record produced by the most recent parseLink() step; overwritten on each page.
var result;

/**
 * Queue the scrape of one speaker page and dump the extracted record.
 *
 * Must be called with `this` set to an object providing start(), then() and
 * evaluate() (the script uses parseLink.call(casperInstance, link)).
 * The record is stored in the file-level `result` and printed with utils.dump.
 *
 * Record shape: name, photo_url, twitter, website, description, bio (strings);
 * speeches [{title, description}], videos [{title, url}],
 * reviews [{organization, body}], books [string].
 *
 * @param {string} link URL of the speaker page to open.
 */
function parseLink(link)
{
    this.start(link);
    this.then(function()
    {
        result = this.evaluate(function()
        {
            // The helpers below live inside this callback on purpose: the
            // function handed to evaluate() is run in the page, so it cannot
            // close over anything declared in the outer script.

            // XPaths whose first match becomes a plain string field.
            var singleXPaths = {
                'name' : "//span[@id='ContentPlaceHolder1_lblSpeakerDisplayName']",
                "photo_url": "//img[@id='ContentPlaceHolder1_speakerImage']/@src",
                "twitter" : "//a[text()=' Twitter']/@href",
                "website" : "//a[contains(text(), 'Official Website')]/@href"
            };
            // XPaths for which every match is collected.
            var listXPaths = {
                'description' : "//span[@id='ContentPlaceHolder1_lblTitle'] | //span[@id='ContentPlaceHolder1_lblPosition']",
                'bio' : "//span[@id='ContentPlaceHolder1_lblBlurb'] | //span[@id='lblLongBlurb']",
                "speeches-title" : "//span[starts-with(@id, 'ContentPlaceHolder1_ctrlSpeeches_lstSpeeches_lblTitle_')]",
                "speeches-description" : "//span[starts-with(@id, 'ContentPlaceHolder1_ctrlSpeeches_lstSpeeches_lblTitle_')]/../following-sibling::span[1]",
                "videos-title" : "//a[starts-with(text(), ' VIDEO:')]",
                "videos-url" : "//a[starts-with(text(), ' VIDEO:')]/@href",
                "reviews-organization" : "//span[starts-with(@id, 'WmeTestimonials1_lstTestimonials_Label2_')]",
                "reviews-body" : "//span[starts-with(@id, 'WmeTestimonials1_lstTestimonials_Label2_')]/../preceding-sibling::text()[string-length() > 10]",
                "books" : "//div[@class='notableWorkWrapper']/a[text() = 'Buy Now']/@href"
            };
            // TODO (original author's note): topics, travels_from, fee

            // Text of the first node matching xpath, or '' when nothing matches.
            function firstText(xpath)
            {
                var node = __utils__.getElementByXPath(xpath);
                // `== null` treats both undefined and null as "no match".
                return node == null ? '' : node.textContent;
            }

            // Text of every node matching the XPath stored under listXPaths[key].
            function allTexts(key)
            {
                var nodes = __utils__.getElementsByXPath(listXPaths[key]);
                return Array.prototype.map.call(nodes, function(node)
                {
                    return node.textContent;
                });
            }

            // Zip two parallel text lists into objects; the second list sets the length.
            function pairUp(firstField, firstValues, secondField, secondValues)
            {
                var pairs = [];
                for (var i = 0; i < secondValues.length; i++)
                {
                    var pair = {};
                    pair[firstField] = firstValues[i];
                    pair[secondField] = secondValues[i];
                    pairs.push(pair);
                }
                return pairs;
            }

            var record = {};
            for (var field in singleXPaths)
            {
                if (singleXPaths.hasOwnProperty(field))
                {
                    record[field] = firstText(singleXPaths[field]);
                }
            }

            // Fields are assigned in a fixed order so the dumped output keeps
            // the same key order regardless of how listXPaths is enumerated.
            record['description'] = allTexts('description').join(' , ');
            record['bio'] = allTexts('bio').join(' , ');
            record['speeches'] = pairUp(
                'title', allTexts('speeches-title'),
                'description', allTexts('speeches-description'));
            record['videos'] = pairUp(
                'title', allTexts('videos-title'),
                'url', allTexts('videos-url'));
            record['reviews'] = pairUp(
                'organization', allTexts('reviews-organization'),
                'body', allTexts('reviews-body'));
            record['books'] = allTexts('books');
            return record;
        });
        utils.dump(result);
    });
}
// Start the run; check is handed to run() as its callback and walks through
// urls one speaker page at a time.
casper.run(check);