I have been PM'd about this question by Crohole so many times that I thought I would set up a topic where others can join in. Crohole has asked many times about setting up a bot like mine, but his questions seem to keep looping back to the beginning. So I'll briefly explain how to make a bot, and if Crohole has any more problems he can post here instead of sending me lots of PMs. First, the basic script that I have set up for a bot is as follows:
<form method="post">Scan site: <input type="text" name="site" value="http://" style="width:300px">
<input value="Scan" type="submit"></form>
<?php
// A crawl can run for an arbitrarily long time, so lift PHP's execution time limit.
// The full "<?php" tag is used because the short "<?" tag only works when the
// short_open_tag ini setting is enabled.
set_time_limit(0);
// Case-insensitive strpos() fallback, declared only when the running PHP
// does not already provide a stripos() of its own.
if (function_exists('stripos') === false) {
    /**
     * Find the position of the first case-insensitive occurrence of a substring.
     *
     * @param string $str    Text to search in.
     * @param string $needle Text to look for.
     * @param int    $offset Position in $str at which to start searching.
     * @return int|false Zero-based position of the match, or false when absent.
     */
    function stripos($str, $needle, $offset = 0)
    {
        $lowerHaystack = strtolower($str);
        $lowerNeedle = strtolower($needle);

        return strpos($lowerHaystack, $lowerNeedle, $offset);
    }
}
if (isset($_POST['site']) && !empty($_POST['site'])) {
/* Formats Allowed */
// Lookup table keyed by lower-case file extension (or trailing URL token);
// a link whose extension is a key of this table is treated as a crawlable page.
$formats = array();
foreach (array(
    'html', 'htm', 'xhtml', 'xml', 'mhtml', 'xht',
    'mht', 'asp', 'aspx', 'adp', 'bml', 'cfm', 'cgi',
    'ihtml', 'jsp', 'las', 'lasso', 'lassoapp', 'pl', 'php',
    'php1', 'php2', 'php3', 'php4', 'php5', 'php6', 'phtml',
    'shtml', 'search', 'query', 'forum', 'blog', '1', '2',
    '3', '4', '5', '6', '7', '8', '9', '10', '11',
    '12', '13', '14', '15', '16', '17', '18', '19',
    '20', '01', '02', '03', '04', '05', '06', '07',
    '08', '09', 'go', 'page', 'file',
) as $allowedFormat) {
    $formats[$allowedFormat] = true;
}
unset($allowedFormat);
/**
 * Return the leading "scheme://host" part of a URL (just the host when the
 * URL carries no http/https scheme), dropping any path, query or fragment.
 *
 * The scheme is matched case-insensitively and the host ends at the first
 * "/", "?" or "#". Input that does not start with a host-like token is
 * returned unchanged.
 *
 * @param string $ddomain URL to reduce.
 * @return string The scheme-and-host prefix of $ddomain.
 */
function domain ($ddomain) {
    return preg_replace('/^((http(s)?:\/\/)?([^\/?#]+))(.*)/i', '$1', $ddomain);
}
/**
 * Check whether a URL answers a header-only (no body) request with HTTP status 200.
 *
 * The status is read from cURL's parsed response code rather than by looking
 * for the text "200 OK" in the raw headers, because HTTP/2 status lines carry
 * no reason phrase ("HTTP/2 200").
 *
 * @param string $durl URL to probe.
 * @return bool True when the request succeeded with status 200, false otherwise
 *              (including when the cURL handle could not be created).
 */
function url_exists($durl)
{
    // Version 4.x supported
    $handle = curl_init($durl);
    if (false === $handle) {
        return false;
    }

    // Make curl_exec() return false for HTTP status codes >= 400.
    curl_setopt($handle, CURLOPT_FAILONERROR, true); // this works
    curl_setopt(
        $handle,
        CURLOPT_USERAGENT,
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.15) Gecko/20080623 Firefox/2.0.0.15"
    );
    // Headers only: the page body is not needed to learn the status.
    curl_setopt($handle, CURLOPT_NOBODY, true);
    // Capture the response instead of echoing it into the page.
    curl_setopt($handle, CURLOPT_RETURNTRANSFER, true);

    $connectable = curl_exec($handle);
    $status = 0;
    if (false !== $connectable) {
        $status = (int) curl_getinfo($handle, CURLINFO_HTTP_CODE);
    }
    curl_close($handle);

    return 200 === $status;
}
// Shared page buffer: getlinks() stores the fetched page body in $f_data and
// generate() declares the same global to read it.
$f_data='';
//below function will only get links within own domain and not links outside the site.
/**
 * Download a page and return the absolute URLs of the same-site links in it.
 *
 * Every href/src attribute value is resolved against the page URL:
 *   - "http(s)://..."  is used as is;
 *   - "//host/..."     borrows the page's scheme;
 *   - "/path"          is resolved against the site root;
 *   - "?query"         is resolved against the page itself;
 *   - anything else    is resolved against the page's directory, honouring
 *                      leading "./" and "../" segments (never climbing above
 *                      the site root).
 * Fragments are stripped, and fragment-only, mailto:, javascript:, tel:,
 * data: and ftp: links are skipped.
 *
 * A resolved link is kept when domain() of it equals domain() of the page and
 * its path is empty, ends in "/", has no "." in its last segment, or has an
 * extension listed in the global $formats table.
 *
 * Side effect: the fetched body is stored in the global $f_data (false when
 * the fetch failed).
 *
 * @param string $generateurlf URL of the page to fetch and scan.
 * @return array List of absolute URL strings; may contain duplicates.
 */
function getlinks($generateurlf) {
    global $formats;
    global $f_data;

    $f_data = file_get_contents($generateurlf);
    if ($f_data === false) {
        // Fetch failed (PHP has already raised a warning); nothing to scan.
        return array();
    }

    preg_match_all('/(href|src)\=(\"|\')([^\"\'\>]+)/i', $f_data, $media);

    // Page URL without query string or fragment, its "scheme://host" root,
    // and the directory the page lives in (no trailing slash).
    $page = preg_replace('/[?#].*$/s', '', $generateurlf);
    $root = domain($page);
    $path = (string) substr($page, strlen($root));
    $cut = strrpos($path, '/');
    $basedir = $root . ($cut === false ? '' : (string) substr($path, 0, $cut));

    $datab = array();
    foreach ($media[3] as $dfile) {
        // Strip the fragment. The look-behind leaves numeric HTML entities
        // such as "&#038;" inside the URL untouched.
        $dfile = (string) preg_replace('/(?<!&)#.*$/s', '', $dfile);
        if ($dfile === '' || preg_match('/^(mailto|javascript|tel|data|ftp):/i', $dfile)) {
            continue;
        }

        if (preg_match('/^https?:\/\//i', $dfile) || strtolower(substr($dfile, 0, 4)) === 'www.') {
            // Already absolute. A scheme-less "www." link is left unchanged,
            // so it only passes the same-site test below if the page URL
            // itself has no scheme.
            $target = $dfile;
        } elseif (substr($dfile, 0, 2) === '//') {
            // Protocol-relative: reuse the page's scheme.
            $scheme = preg_match('/^[a-z][a-z0-9+.\-]*:/i', $root, $m) ? $m[0] : 'http:';
            $target = $scheme . $dfile;
        } elseif ($dfile[0] === '/') {
            // Root-relative: resolve against the host, not the current directory.
            $target = $root . $dfile;
        } elseif ($dfile[0] === '?') {
            // Query-only: same page, different query string.
            $target = ($path === '' ? $root . '/' : $page) . $dfile;
        } else {
            // Document-relative: consume leading "./" and "../" segments.
            $dir = $basedir;
            while (true) {
                if (substr($dfile, 0, 2) === './') {
                    $dfile = (string) substr($dfile, 2);
                } elseif (substr($dfile, 0, 3) === '../') {
                    $dfile = (string) substr($dfile, 3);
                    // Go up one directory, but never above the site root.
                    if (strlen($dir) > strlen($root)) {
                        $dir = (string) substr($dir, 0, strrpos($dir, '/'));
                    }
                } else {
                    break;
                }
            }
            $target = $dir . '/' . $dfile;
        }

        // Same-site filter.
        if (domain($target) !== $root) {
            continue;
        }

        // Page-type filter, evaluated on the path without its query string.
        $linkpath = (string) preg_replace('/\?.*$/s', '', (string) substr($target, strlen($root)));
        $keep = ($linkpath === '' || substr($linkpath, -1) === '/');
        if (!$keep) {
            $name = basename($linkpath);
            if (strpos($name, '.') === false) {
                $keep = true;
            } else {
                $ext = strtolower(pathinfo($name, PATHINFO_EXTENSION));
                $keep = isset($formats[$ext]);
            }
        }
        if ($keep) {
            $datab[] = $target;
        }
    }

    return $datab;
}
//=============================================
/* Modify only code between these two lines and $formats variable above. */
/**
 * Per-page hook: called by the crawl loop for every URL that passed url_exists().
 *
 * Prints the URL followed by "<br>" and then releases the page body held in
 * the global $f_data. Put any processing of the page contents before that release.
 *
 * @param string $url Absolute URL of the page being reported.
 * @return void
 */
function generate($url) {
    // The URL comes from user input or from crawled markup, so escape it
    // before writing it into the HTML response. ENT_SUBSTITUTE (where it
    // exists) keeps URLs with invalid byte sequences from printing as empty.
    $flags = defined('ENT_SUBSTITUTE') ? (ENT_QUOTES | ENT_SUBSTITUTE) : ENT_QUOTES;
    echo htmlspecialchars($url, $flags).'<br>';
    global $f_data; //Data of file contents
    //do something with webpage $f_data.
    // Assign null instead of unset(): unset() on a variable imported with
    // "global" only removes the local alias and leaves the global buffer alive.
    $f_data = null;
}
//=============================================
// Below is what actually process the search engine
// Breadth-first crawl: $sites is the queue of URLs in discovery order and
// $seen records every URL that has ever been queued, so none is queued twice.
$seed = is_string($_POST['site']) ? $_POST['site'] : '';
// Magic quotes only exist before PHP 5.4; strip the added slashes only when
// they were really applied.
if (version_compare(PHP_VERSION, '5.4.0', '<') && get_magic_quotes_gpc()) {
    $seed = stripslashes($seed);
}
$seed = trim($seed);

$sites = array();
$seen = array();
// Accept only http(s) URLs that name a host, so the form cannot be used to
// read local files or other stream wrappers through file_get_contents().
if (preg_match('/^https?:\/\/[^\/\s]+/i', $seed)) {
    $sites[] = $seed;
    $seen[$seed] = true; // the start page is marked too, so a link back to it is not re-crawled
}

for ($i = 0; isset($sites[$i]); $i++) {
    // getlinks() also loads the page body into the global $f_data for generate().
    foreach (getlinks($sites[$i]) as $val) {
        if (!isset($seen[$val])) {
            $seen[$val] = true;
            $sites[] = $val;
        }
    }
    unset($val);

    if (url_exists($sites[$i])) {
        generate($sites[$i]);
        flush(); // push each result to the browser as soon as it is found
    }
}
}
?>
Now, the only part that really needs changing in order to record results is the generate() function. It is defined in the section clearly marked between two long comment bars. That is how it works, so if you have any problems, Crohole, post here instead of PM-ing me.