Hi all,
I recently made a web crawler as part of a Uni team project building a small search engine. I've managed to incorporate it into our GUI: there is a button that starts the crawling process by running the crawler on its own thread. (I did this because otherwise the GUI would freeze while the crawler worked.)
However, I would also like the crawler to update a JLabel on the GUI while it's working, i.e. how many pages it has crawled and how many are left to go.
How can I do this? I've tried a while loop in the button event, but that doesn't work (I assume because it blocks the event thread). I also tried setting the label from inside the crawler, but it cannot access the GUI's components from outside the object. How would you solve this?
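To make the question concrete, the kind of thing I'm picturing is some callback the crawler fires after each page, with the GUI hopping back onto the event dispatch thread before touching the label. StatusListener here is just a name I made up to illustrate, not something that exists in our project:

// Hypothetical callback interface that the crawler would invoke after each page.
public interface StatusListener
{
    void statusChanged(String status);
}

// In the GUI, assuming WebCrawler were changed to accept such a listener:
final WebCrawler crawler = new WebCrawler(new StatusListener()
{
    public void statusChanged(final String status)
    {
        // Swing components may only be touched on the event dispatch thread,
        // so schedule the label update there instead of setting it directly.
        SwingUtilities.invokeLater(new Runnable()
        {
            public void run()
            {
                crawlerStatus.setText(status);
            }
        });
    }
});

Is that roughly the right shape, or is there a more standard way to do it?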
Any general pointers or advice are also appreciated. I enjoy programming and am eager to learn if I'm doing anything wrong or could improve any of my practices. Thank you :)
CrawlAnalyseGUI.java
import javax.swing.*;
import java.awt.*;
import java.awt.event.*;
public class CrawlAnalyseGUI extends JFrame
{
    private JButton crawlButton, searchButton;
    private JTextField searchField;
    private JLabel crawlerStatus;
    private JPanel searchPanel, crawlerPanel;

    public CrawlAnalyseGUI()
    {
        final WebCrawler crawler = new WebCrawler();

        searchField = new JTextField("Enter Search Term", 10);
        searchButton = new JButton("Search");
        // ActionListener is the proper hook for buttons: it also fires on
        // keyboard activation, which a MouseListener misses.
        searchButton.addActionListener(new ActionListener()
        {
            @Override
            public void actionPerformed(ActionEvent e)
            {
                // do search!
            }
        });
        searchPanel = new JPanel();
        searchPanel.add(searchField);
        searchPanel.add(searchButton);

        crawlerStatus = new JLabel("Waiting to start...");
        crawlButton = new JButton("Start Crawler");
        crawlButton.addActionListener(new ActionListener()
        {
            @Override
            public void actionPerformed(ActionEvent e)
            {
                String buttonText = crawlButton.getText();
                if(buttonText.equals("Start Crawler"))
                {
                    crawler.start();
                    /* does not update label...
                    while(crawler.Q.size() > 1)
                    {
                        crawlerStatus.setText("Left in queue: "+ crawler.Q.size() +", and "+ crawler.FoundDocs.size() +" pages crawled.");
                    }
                    */
                    crawlButton.setText("Stop Crawler");
                }
                else
                {
                    // Thread.stop() is deprecated/unsafe. What should I do instead?
                    // (A flag-based idea is sketched after this listing.)
                    crawler.stop();
                }
            }
        });
        crawlerPanel = new JPanel();
        crawlerPanel.add(crawlerStatus);
        crawlerPanel.add(crawlButton);

        this.setLayout(new GridLayout(2, 1, 5, 5));
        this.add(searchPanel);
        this.add(crawlerPanel);
    }
    public static void main(String[] args)
    {
        // Swing components should be created and shown on the event dispatch thread.
        SwingUtilities.invokeLater(new Runnable()
        {
            public void run()
            {
                CrawlAnalyseGUI gui = new CrawlAnalyseGUI();
                gui.setSize(400, 200);
                gui.setResizable(false);
                gui.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
                gui.setVisible(true);
            }
        });
    }
}
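On the crawler.stop() question above: from what I've read, the usual replacement is a volatile flag that the crawl loop checks, so the thread winds down on its own. This is only a rough sketch of the shape I mean (requestStop() is a name I invented), not the real class below:

import java.util.ArrayList;

public class WebCrawler extends Thread
{
    ArrayList<String> Q = new ArrayList<String>();

    // Checked by the crawl loop; volatile so the worker thread reliably
    // sees a change made from the event dispatch thread.
    private volatile boolean running = true;

    // The Stop button would call this instead of the deprecated stop().
    public void requestStop()
    {
        running = false;
    }

    public void run()
    {
        while(running && !Q.isEmpty())
        {
            // ... crawl one URL per iteration, as in the full run() below ...
        }
    }
}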
WebCrawler.java
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.ArrayList;
public class WebCrawler extends Thread
{
    ArrayList<CrawledDoc> FoundDocs = new ArrayList<CrawledDoc>();
    ArrayList<String> Q = new ArrayList<String>();
    ArrayList<String> backupQ = new ArrayList<String>();
    boolean correctProtocol = true;
    Document page = null;
    Element firstP;
    Elements links;
    int URLcount = 0, docID = 0;
    String[] initialURLs = {"http://www.bbc.co.uk", "http://www.msn.co.uk", "http://www.engadget.com", "http://www.skynews.com", "http://www.youtube.com", "http://www.theregister.co.uk", "http://en.wikipedia.org/wiki/Main_Page"};
    String title, content, abslink = null, stemmed;
    TextAnalyser Analyse = new TextAnalyser();
    public void run()
    {
        // add the seed URLs to the queue
        for(int counter = 0; counter < initialURLs.length; counter++)
            Q.add(initialURLs[counter]);

        // overall loop which ends when all queued URLs have been crawled
        while(!Q.isEmpty())
        {
            // remove the URL at the front of the queue
            String L = Q.remove(0);
            System.out.println("Opening: "+ L);

            // connect to the URL and get the HTML page; skip this URL if the
            // fetch fails, otherwise page would be null (or stale) further down
            try
            {
                page = Jsoup.connect(L).get();
            }
            catch(IOException e) // also covers SocketException
            {
                System.out.println("Could not fetch "+ L +": "+ e.getMessage());
                continue;
            }
            catch(IllegalArgumentException e)
            {
                System.out.println("Skipping malformed URL: "+ L);
                continue;
            }
            // retrieve all the links from the page, the first paragraph and the title
            // (all taken from the retrieved HTML).
            links = page.select("a[href]");
            firstP = page.select("p").first();
            title = page.title();

            // check that content was found and substitute the title if not,
            // thereby avoiding a NullPointerException
            if(firstP == null) content = title;
            else content = firstP.text();
            // loop over every link found on the crawled page:
            // ensure it's a link to a web page and that we don't already have it,
            // then it can be added.
            for(Element link : links)
            {
                abslink = link.attr("abs:href");
                // decide per link whether the protocol is one we can crawl
                correctProtocol = !abslink.startsWith("ftp:") && !abslink.startsWith("mailto:");
                // note: this only checks the pending queue, so a URL that was
                // already crawled (and removed) could be queued again
                if((!Q.contains(abslink)) && correctProtocol)
                {
                    Q.add(abslink);
                    System.out.println(abslink + " has been added to the queue.");
                }
            }
            // because a document has been opened, an object will be made;
            // this object needs a unique ID, so an integer keeps track and is incremented each time
            docID++;

            // Because we don't know how many tokens we may get from each page,
            // the tokenizer returns a fresh ArrayList for each one
            ArrayList<String> tokened = Analyse.tokenizer(content);

            // these tokens are then put through a stopper and Porter's algorithm.
            // The output of this is saved alongside the original content (for readability)
            String stopped = Analyse.stopper(tokened);
            Porter porter = new Porter(stopped);
            stemmed = porter.getStem();
            // An object is created saving all the information about this page.
            // The link is saved so that we can click on it and visit the full page
            // (note: L is the URL just crawled; abslink would be the last link seen on the page).
            // The document ID is kept so we can distinguish between pages in the inverted file.
            // The title is saved so we can use it as a title for results.
            // The original content is saved for readability on the results page, and the stemmed version to be searched.
            // Finally, add this object to the ArrayList.
            CrawledDoc tempDoc = new CrawledDoc(L, docID, title, content, stemmed);
            FoundDocs.add(tempDoc);

            // I would like to update the GUI here! (see the SwingWorker sketch after this listing)
            //gui.crawlerStatus.setText("Left in queue: "+ Q.size() +", and "+ FoundDocs.size() +" pages crawled.");
            // Print the object so the group can see its structure.
            System.out.println("Q size: "+ Q.size());
            System.out.println("\nCrawledDoc["+docID+"] = Link: "+L+"\nDocID: "+docID+"\nTitle: "+title+"\nContent: "+content+"\nContentStemmed: "+stemmed+"\n");
        }
    }
}
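And on updating the label itself: if we're allowed to restructure a bit, SwingWorker looks like the standard tool, because publish()/process() does the thread hop automatically. A rough sketch of how the crawl loop might be wrapped; CrawlWorker and its constructor are my own invention, not code from our project:

import java.util.List;
import javax.swing.JLabel;
import javax.swing.SwingWorker;

public class CrawlWorker extends SwingWorker<Void, String>
{
    private final JLabel statusLabel;

    public CrawlWorker(JLabel statusLabel)
    {
        this.statusLabel = statusLabel;
    }

    @Override
    protected Void doInBackground()
    {
        // ... run the same crawl loop as WebCrawler.run(), but after each page call:
        // publish("Left in queue: "+ Q.size() +", and "+ FoundDocs.size() +" pages crawled.");
        // The loop condition could also check isCancelled(), so the Stop button
        // can call cancel(true) rather than the deprecated Thread.stop().
        return null;
    }

    @Override
    protected void process(List<String> chunks)
    {
        // process() runs on the event dispatch thread, so it is safe to touch the label.
        statusLabel.setText(chunks.get(chunks.size() - 1));
    }
}

The button handler would then just do new CrawlWorker(crawlerStatus).execute(). Does that sound like the right direction?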