Hi. My intention is to take a local HTML file, extract all of its relative links, then open each of those files and extract their relative links, and so on.
Pretty much I need to build an HTML link tree from an initial HTML file.
I have written some code below that takes an option for relative or absolute links plus a filename, and outputs all the links it finds to a text file. The only problem now is that I need to feed those filenames back in so the program can run recursively and grab all the chained HTML pages (I've put a rough sketch of what I mean after the code).
To run it, I enter at the command prompt: filename.pl -r filename.htm
Here is my code; any suggestions?
Thanks in advance!
#!/usr/bin/perl -w
use strict;
use Getopt::Std;
use LWP::Simple;
use HTML::Parser;
#
# Grab all links from local or remote html file
# perl html munging
#
# option -a grabs only absolute urls, -r only relative urls
# get options and argument
#
my %opts;
getopts('ar', \%opts);
my $arg = shift;
die "Usage: $0 [-a | -r] filename [| URL]\n"
if (not defined $arg or $opts{a} && $opts{r}); # allow either -a or -r
# get the page either from file or url
#
my $page;
if ($arg =~ m!^http://!) {
    $page = get($arg)
        or die "Couldn't get $arg\n";   # LWP::Simple::get doesn't set $!
}
else {
    open my $fh, '<', $arg
        or die "Couldn't open $arg: $!\n";
    $page = do { local $/; <$fh> };
    close $fh;
}
# set the parser and parse
#
my $parser = HTML::Parser->new(
    api_version => 3,
    start_h     => [ \&start, 'tagname, attr' ],
);
my @links;
sub start {
    my ($tag, $attr) = @_;
    if ($tag eq 'a' and defined $attr->{href}) {
        return if $attr->{href} =~ m!^http://! and $opts{r};  # skip absolute urls when -r
        return if $attr->{href} !~ m!^http://! and $opts{a};  # skip relative urls when -a
        push @links, $attr->{href};
    }
}
$parser->parse($page);
$parser->eof;
# output
#
my $append = 1;   # set to 0 to overwrite TreeList.txt instead of appending
my $mode = $append ? '>>' : '>';
open my $outfile, $mode, 'TreeList.txt'
    or die "Couldn't open TreeList.txt: $!\n";
print $outfile "$_\n" for @links;
close $outfile;
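
To make the recursion part concrete, here's the rough shape I have in mind (an untested sketch, not a finished solution). Instead of literal recursion it uses a worklist plus a %seen hash so circular links can't loop forever, and it assumes the relative hrefs are plain file paths (no fragments or query strings) that resolve against the directory of the file they appear in:

#!/usr/bin/perl
use strict;
use warnings;
use HTML::Parser;
use File::Basename qw(dirname);
use File::Spec;

my $start = shift or die "Usage: $0 filename\n";
my %seen;               # files already visited, so circular links don't loop
my @queue = ($start);   # worklist of files still to parse

open my $out, '>', 'TreeList.txt'
    or die "Couldn't open TreeList.txt: $!\n";

while (my $file = shift @queue) {
    next if $seen{$file}++;
    for my $href (extract_links($file)) {
        next if $href =~ m!^http://!;   # relative links only
        # resolve the link against the directory of the current file
        my $path = File::Spec->catfile(dirname($file), $href);
        print $out "$file -> $path\n";  # record the tree edge
        push @queue, $path if -e $path; # follow it only if the file exists
    }
}
close $out;

# parse one file and return the href of every <a> tag in it
sub extract_links {
    my ($file) = @_;
    open my $fh, '<', $file
        or do { warn "Couldn't open $file: $!\n"; return };
    my $page = do { local $/; <$fh> };
    close $fh;

    my @links;
    my $parser = HTML::Parser->new(
        api_version => 3,
        start_h     => [
            sub {
                my ($tag, $attr) = @_;
                push @links, $attr->{href}
                    if $tag eq 'a' and defined $attr->{href};
            },
            'tagname, attr',
        ],
    );
    $parser->parse($page);
    $parser->eof;
    return @links;
}

The queue makes the walk breadth-first; popping off the end of the array instead would make it depth-first, and true recursion would work the same way as long as the %seen check stays.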