The following program downloads the Python programs contained in a thread of the Python forum. Just start the program: it will prompt you for the thread number and create a directory containing the code extracted from the thread. I used it to download all the wx examples. Note that the algorithm is very primitive: only code written inside language tags is extracted (raw code is not), and data attached to posts is not downloaded. It is also not robust: if the formatting on DaniWeb changes tomorrow, it won't work anymore :)
#!/usr/bin/env python
# danidown.py
from htmllib import HTMLParser
from formatter import AbstractFormatter ,AbstractWriter ,NullWriter
import re
from os import mkdir
from os .path import isdir ,join as pjoin
from urllib2 import urlopen
class aThread(object):
    """A daniweb.com forum thread whose posted code can be saved to disk.

    The thread is identified by its number.  Its pages are fetched over
    HTTP, parsed with a ``Writer1`` and every code block found is written
    to ``thread<number>/reply<n>/prog<k>.py`` under the current directory.
    """

    # Matches the pager text (e.g. "Page 1 of 7"); group 1 is the page total.
    itsCntPattern = re.compile(r"Page \d+ of (\d+)")

    def __init__(o, theThreadNumber):
        """Store the thread number and reset the download state."""
        o.itsNumber = theThreadNumber
        o._itsPageCnt = None  # page count cache, filled by itsPageCnt
        o.itsReply = 0  # highest reply number seen by doDownload
        o.itsCodeIndex = 0  # programs written so far for that reply

    def itsUrl(o, thePage=1):
        """Return the URL of page ``thePage`` (1-based) of the thread."""
        x = "" if (thePage == 1) else "-%d" % thePage
        return "http://www.daniweb.com/forums/thread%d%s.html" % (
            o.itsNumber, x)

    def itsContent(o, thePage=1):
        """Fetch page ``thePage`` and return its raw content."""
        f = urlopen(o.itsUrl(thePage))
        try:
            return f.read()
        finally:
            # Release the connection even when reading fails.
            f.close()

    @property
    def itsPageCnt(o):
        """Number of pages in the thread, fetched once and then cached.

        Falls back to 1 when the first page shows no "Page x of y" text.
        """
        if o._itsPageCnt is None:
            # Assign the cache only after the fetch succeeded, so that a
            # network error does not leave a bogus count of 1 behind.
            theMatch = o.itsCntPattern.search(o.itsContent(1))
            if theMatch is None:
                o._itsPageCnt = 1
            else:
                o._itsPageCnt = int(theMatch.group(1))
        return o._itsPageCnt

    @property
    def itsTriples(o):
        """Generate the (reply number, author, code) tuples of the thread.

        Every page is downloaded and parsed in turn; the tuples are the
        ones collected by the ``Writer1`` that received the page.
        """
        theCnt = o.itsPageCnt
        printMessage("The thread contains %d pages..." % theCnt)
        for i in range(1, theCnt + 1):
            printMessage("Page %d..." % i)
            theWriter = Writer1()
            theParser = HTMLParser(AbstractFormatter(theWriter))
            theParser.feed(o.itsContent(i))
            theParser.close()
            for theTriple in theWriter.itsTriples:
                yield theTriple

    @property
    def itsFolder(o):
        """Name of the directory receiving everything from this thread."""
        return "thread%d" % o.itsNumber

    def itsReplyFolder(o, n):
        """Return the path of the sub-directory used for reply ``n``."""
        return pjoin(o.itsFolder, "reply%d" % n)

    @staticmethod
    def _makeFolder(thePath):
        """Create directory ``thePath`` unless it already exists."""
        if not isdir(thePath):
            mkdir(thePath)

    @staticmethod
    def _writeFile(thePath, theText):
        """Write ``theText`` to ``thePath``, closing the file on error too."""
        f = open(thePath, "w")
        try:
            f.write(theText)
        finally:
            f.close()

    def doDownload(o):
        """Save every code block of the thread below ``itsFolder``.

        Each reply gets its own folder holding an ``author`` file and the
        programs ``prog1.py``, ``prog2.py``, ... in order of appearance.
        """
        for theReply, theAuthor, theCode in o.itsTriples:
            theFolder = o.itsReplyFolder(theReply)
            isNewReply = theReply > o.itsReply
            if isNewReply:
                printMessage("reply %d..." % theReply)
                o.itsReply = theReply
                o.itsCodeIndex = 0
            # The folder is also prepared when it is simply missing: code
            # reported with reply number 0 (Writer1's initial value) never
            # counts as a "new" reply, and writing into a folder that was
            # never created would fail.
            if isNewReply or not isdir(theFolder):
                o._makeFolder(o.itsFolder)
                o._makeFolder(theFolder)
                o._writeFile(pjoin(theFolder, "author"), theAuthor + "\n")
            o.itsCodeIndex += 1
            o._writeFile(
                pjoin(theFolder, "prog%d.py" % o.itsCodeIndex), theCode)
        print("done.")
class Writer1(NullWriter):
    """Formatter writer that collects the code blocks of a forum page.

    Code is recognised as a numbered list: the label "1." opens a block,
    each following label starts a new line, literal data fills the current
    line and the end of the paragraph closes the block.  Flowing data is
    scanned for the author and the number of the reply being read.

    The result is ``itsTriples``, a list of
    ``(reply number, author, code)`` tuples in page order.
    """

    def __init__(o):
        """Start outside any code block with no author or reply known."""
        NullWriter.__init__(o)
        o.isInCode = False  # True while inside a numbered code listing
        o.itsCode = None  # list of lines, each a list of text fragments
        o.itsAuthor = "unknown"
        o.itsAnswer = 0  # number of the reply currently being read
        # State flags of the small automaton in send_flowing_data.
        o.justReadAuthor = False
        o.nextIsAuthor = False
        o.nextIsNumber = False
        o.itsTriples = []

    def send_label_data(o, data):
        """Open a code block on label "1." or start its next line.

        Raises ValueError when a label inside a code block is not the
        expected "<line number>." (the original used ``assert`` here,
        which disappears under ``python -O``).
        """
        if o.isInCode:
            if not data.endswith("."):
                raise ValueError("unexpected code label %r" % (data,))
            n = int(data[:-1])
            if n != len(o.itsCode) + 1:
                raise ValueError(
                    "code line %d labelled %r" % (len(o.itsCode) + 1, data))
            o.itsCode.append([])
        elif data == "1.":
            o.isInCode = True
            o.itsCode = [[]]

    def send_literal_data(o, data):
        """Append a text fragment to the current code line."""
        # "\xa0" fragments are dropped rather than copied into the code.
        if o.isInCode and data != "\xa0":
            o.itsCode[-1].append(data)

    def send_line_break(o):
        """Ignore line breaks.

        Lines are delimited by their labels; the fragments of every line
        are joined in send_paragraph, so a line stays a list until then
        and literal data can never hit an already joined string.
        """
        pass

    def send_paragraph(o, data):
        """Close the current code block, if any, and record its triple."""
        if o.isInCode:
            # Joining here also covers a last line that was not followed
            # by a line break.
            theCode = "\n".join(["".join(theLine) for theLine in o.itsCode])
            o.itsCode = None
            o.isInCode = False
            o.itsTriples.append((o.itsAnswer, o.itsAuthor, theCode))

    def send_flowing_data(o, data):
        """Track the author and reply number announced in running text.

        Expected sequence: a chunk starting with " Solved Threads:", then
        the author name, then " #", then the reply number.
        """
        if o.nextIsNumber:
            o.itsAnswer = int(data)
            o.nextIsNumber = False
        elif o.justReadAuthor:
            # NOTE(review): the source lost its indentation; resetting
            # justReadAuthor only once " #" is seen is a reconstruction.
            # Confirm against the original script.
            if data == " #":
                o.nextIsNumber = True
                o.justReadAuthor = False
        elif o.nextIsAuthor:
            o.itsAuthor = data
            o.justReadAuthor = True
            o.nextIsAuthor = False
        elif data.startswith(" Solved Threads:"):
            o.nextIsAuthor = True
def printMessage(msg):
    """Report progress by printing ``msg`` on standard output."""
    # Parenthesised single-argument form: prints the same text whether
    # ``print`` is a statement (Python 2) or a function.
    print(msg)
if __name__ == "__main__":
    # Ask for the thread number, then download the code of that thread
    # into a "thread<number>" directory below the current directory.
    theAnswer = raw_input("Enter thread number: ")
    try:
        n = int(theAnswer)
    except ValueError:
        # Exit with a readable message instead of a traceback.
        raise SystemExit("Not a valid thread number: %r" % (theAnswer,))
    theThread = aThread(n)
    theThread.doDownload()