I've been working on this webcrawler and I've run into a problem. I can read the first URL and pull all the URLs out of the HTML, but I can't seem to set up a looping structure that will work.
This is basically what it does:
Searches through the HTML of the first URL.
It may find, say, 20 other URLs contained in that one.
Then it stops.
How can I make it continuously search through the ones that were found?
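What I think I need is some kind of list of URLs still to visit, with a loop that keeps pulling the next one off, scanning it, and adding whatever new links it finds back onto the list. Something roughly like this is what I have in mind (just a sketch of the idea, the class and method names are made up and scanPage is only a placeholder, not my real code):

import java.net.URL;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Queue;
import java.util.Set;

public class CrawlLoopSketch
{
    public static void crawl(URL startURL)
    {
        Queue<URL> toVisit = new ArrayDeque<URL>();   // URLs waiting to be crawled
        Set<URL> visited = new HashSet<URL>();        // URLs already crawled
        toVisit.add(startURL);

        while (!toVisit.isEmpty())
        {
            URL current = toVisit.remove();
            if (!visited.add(current))
            {
                continue; // already crawled this one
            }
            // scanPage stands in for reading the HTML and collecting its links
            for (URL found : scanPage(current))
            {
                if (!visited.contains(found))
                {
                    toVisit.add(found);
                }
            }
        }
    }

    // placeholder so the sketch is self-contained; the real version would parse the page
    static List<URL> scanPage(URL page)
    {
        return new ArrayList<URL>();
    }
}

Is that the right sort of structure, or is there a better way to do it?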
Here is the code I have so far, but it's not complete:
import java.io.*;
import java.net.*;
import java.util.*;

public class CustomWebCrawler implements Runnable
{
    // URLs queued to search, URLs already searched, and unique matches found so far
    ArrayList<URL> alCurrentSearches = new ArrayList<URL>();
    ArrayList<URL> alAlreadySearched = new ArrayList<URL>();
    ArrayList<URL> alMatchingSearches = new ArrayList<URL>();
    Thread running;
    URL enteredURL;
    int count = 0;

    public CustomWebCrawler()
    {
    }
    public void start()
    {
        if (running == null)
        {
            // pass this Runnable to the thread and actually start it,
            // otherwise run() never gets called
            running = new Thread(this);
            running.start();
        }
    }
    public void stop()
    {
        if (running != null)
        {
            running = null;
        }
    }
    public void run()
    {
        // only crawl http URLs; bail out if nothing usable was entered
        if (enteredURL == null || !"http".equals(enteredURL.getProtocol()))
        {
            running = null;
            return;
        }
        alCurrentSearches.add(enteredURL);
        BufferedReader br = null;
        try
        {
            br = new BufferedReader(new InputStreamReader(enteredURL.openStream()));
            String inputText;
            while ((inputText = br.readLine()) != null)
            {
                // crude link detection: only catches one <a href="..."> per line
                int first = inputText.indexOf("<a href=");
                int end = inputText.indexOf("\">", first);
                if (first != -1 && end != -1)
                {
                    findURL(inputText, first, end);
                }
            }
        }
        catch (Exception e)
        {
            e.printStackTrace();
        }
        finally
        {
            try { if (br != null) br.close(); } catch (IOException ioe) { }
        }
    }
    public void findURL(String text, int numFirst, int numEnd)
    {
        // skip past <a href=" (9 characters) and take everything up to the closing ">
        String link = text.substring(numFirst + 9, numEnd);
        try
        {
            URL newURL = new URL(link);
            if ("http".equals(newURL.getProtocol()))
            {
                if (!alMatchingSearches.contains(newURL))
                {
                    alAlreadySearched.add(newURL);
                    alMatchingSearches.add(newURL);
                    System.out.println(newURL);
                }
            }
        }
        catch (MalformedURLException mue)
        {
            // ignore links that are not valid absolute URLs
        }
    }
}
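One other thing I'm not sure about: the indexOf approach above only catches one link per line and assumes the href is in double quotes. I was wondering if a regex would be more reliable for pulling the links out; this is just an idea I'm toying with (LinkFinderSketch and findLinks are made-up names, not part of my class):

import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class LinkFinderSketch
{
    // rough pattern: grabs whatever sits between href=" and the next double quote
    private static final Pattern HREF =
        Pattern.compile("<a\\s+href=\"([^\"]+)\"", Pattern.CASE_INSENSITIVE);

    public static List<URL> findLinks(String line)
    {
        List<URL> links = new ArrayList<URL>();
        Matcher m = HREF.matcher(line);
        while (m.find())
        {
            try
            {
                links.add(new URL(m.group(1)));
            }
            catch (MalformedURLException mue)
            {
                // skip relative or otherwise malformed links
            }
        }
        return links;
    }
}

Would that be a better way to go, or should I stick with indexOf for now?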