I have the following script:
import sys
import os
import re
pages = []
if len(sys.argv) > 1:
for root, dirs, files in os.walk(sys.argv[1]):
for f in files:
filename = os.path.join(root, f)
if (filename.endswith('.html')) or (filename.endswith('.htm')):
pages.append(filename)
for page in pages:
f = open(page, "r")
count = 1
for line in f:
if re.match("<a href=\"", line):
split1 = re.split("<a href=\"", line)[1]
split = re.split("\"", split1)
if not os.path.exists(split[0]):
if not split[0].startswith("www"):
print "################### Link Error ###################"
print "Error with line %i in file %s, the page '%s' doesn't exist:" % (count, page, split[0])
print line
elif re.match("<a href='", line):
split1 = re.split("<a href='", line)[1]
split = re.split("'", split1)
if not os.path.exists(os.path.abspath(split[0])):
if not split[0].startswith("www"):
print "################### Link Error ###################"
print "Error with line %i in file %s, the page '%s' doesn't exist:" % (count, page, split[0])
print line
count += 1
when run:
$ python linkvalidator.py /home/matio/Documents/website/
################### Link Error ###################
Error with line 50 in file /home/matio/Documents/website/fun.htm, the page '06 - Track 6.mp3' doesn't exist:
<a href="06 - Track 6.mp3">Some music!</a>
################### Link Error ###################
Error with line 25 in file /home/matio/Documents/website/school_quiz.html, the page 'fun.htm' doesn't exist:
<a href="fun.htm">Back</a>
################### Link Error ###################
Error with line 11 in file /home/matio/Documents/website/imagemap/overview.htm, the page '../map.htm' doesn't exist:
<a href="../map.htm">Back to map page</a>
when i ls:
$ ls ~/Documents/website/
06 - Track 6.mp3 fun.htm jblock.gif miss.htm~ quote10.html resources student_office.htm video.htm
art.png fun.htm~ les.htm mr_.htm quote10.html~ r.png student_office.htm~ video.htm~
b.png german_songs.htm lessons.htm~ mr.htm~ quote1.html school_quiz.html style_change.js videos
buddies.htm german_songs.htm~ library.png mrs_kane.htm quote2.html school_quiz.html~ style_change.js~ v_map.htm
buddies.htm~ gi.htm main_office.htm mrs_kane.htm~ quote3.html school_quiz_process.js style.htm v_map.htm~
canteen.htm gi.htm~ main_office.htm~ office.png quote4.html school_quiz_process.js~ style.htm~ year_6_2.htm
canteen.htm~ imagemap map2.htm overview.jpg quote5.html slideshow.htm Templates year_6_2.htm~
chapel.png images map.htm parents.htm quote6.html slideshow.htm~ test.htm year_6.htm
cl.htm index.html map.htm~ q_a.htm quote7.html sportshall_outside.png tour.htm year_6.htm~
credits.htm interviews.htm mblock.jpg q_a.htm~ quote8.html spotshall.png video_2.htm~
css interviews.htm~ miss.htm quote0.html quote9.html stained_glass.png video_3.htm~