Hi,
I'm trying to make a program that will go to 4chan and download the images in a thread (e.g. http://4chan.org/b). The program works the first time, but when I run it again it tries to download the same URLs it found the first time, and those have 404'ed. Please help. Also, 4chan is NSFW (most of you probably know about it already, but just in case).
Edit:
I just tested the code on 4chan.org/s and it works perfectly, so I don't know why it's not working on 4chan.org/b.
Here's my code:
import os
import sys
import urllib
import urllib2
def geturl(url, dst):
##    dst = 'chan/' + dst
##    pic = urllib2.urlopen(url)
##    output = open(dst, 'wb')
##    output.write(pic.read())
##    output.close()
    # print "get url '%s' to '%s'" % (url, dst)
    if sys.stdout.isatty():
        # running interactively: save into the chan/ folder
        dst = 'chan/' + dst
        urllib.urlretrieve(url, dst)
        base = os.path.basename(url)
    else:
        urllib.urlretrieve(url, dst)
url = 'http://www.4chan.org/s'
f = urllib2.urlopen(url)
siteinfo = []
for lines in f:
    lines = lines.replace('\n', '')
    siteinfo.append(lines)
# scan every line of the page for .jpg links
temp = 0
urls = []
image = None
imagename = []
base_url = []
while temp < len(siteinfo):
    try:
        if '.jpg' in siteinfo[temp]:
            image = str(siteinfo[temp])
            # slice out everything from 'http' up to the end of 'jpg'
            temp1 = image.index('http')
            image = image[temp1:]
            temp1 = image.rindex('jpg')
            temp1 += 3
            image = image[:temp1]
            # trim anything after a space or a closing quote
            if " " in image:
                try:
                    temp1 = image.index(' ')
                    image = image[:temp1]
                except:
                    pass
            if ".jpg\"" in image:
                try:
                    temp1 = image.index("\"")
                    image = image[:temp1]
                except:
                    pass
            urls.append(image)
            # the file name is everything after the last '/'
            temp5 = image.rindex('/')
            temp5 += 1
            temp5 = image[temp5:]
            imagename.append(temp5)
    except:
        pass
    temp += 1
# base_url[i] is urls[i] up to and including the last '/'
temp9 = 0
while temp9 < len(urls):
    tempa = urls[temp9]
    temp1 = tempa[:tempa.rindex('/') + 1]
    base_url.append(temp1)
    temp9 += 1
# drop any entry that somehow isn't a .jpg link
# (walk by index so the three lists stay in sync while deleting)
temp = 0
while temp < len(urls):
    print urls[temp]
    if '.jpg' in urls[temp]:
        print 'good'
        temp += 1
    else:
        del urls[temp]
        del imagename[temp]
        del base_url[temp]
folder_files = []
path = "chan"
dirList = os.listdir(path)
for fname in dirList:
    folder_files.append(fname)
# download each image, skipping duplicate file names
temp = 0
while temp < len(imagename):
    try:
        file_name = imagename[temp]
        if file_name not in imagename[temp + 1:]:
            # if file_name not in folder_files:
            url = base_url[temp]
            full_url = str(url) + str(file_name)
            geturl(full_url, file_name)
            print 'Downloaded:', file_name
        temp += 1
    except:
        break

print '\nDownloads Complete'
f.close()
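
For what it's worth, here's a stripped-down sketch of what I'm aiming for in the end: pull the .jpg links out with a regex instead of all the string slicing, and skip anything whose file name is already sitting in the chan folder (the check I have commented out above). The regex, the fetch_new_images name, and the chan default are just my guesses at how it should look, not something I've tested against 4chan's actual markup:

import os
import re
import urllib
import urllib2

# rough guess at the link format: an absolute http URL ending in .jpg
JPG_RE = re.compile(r'http://[^"\s]+\.jpg')

def fetch_new_images(page_url, dest_dir='chan'):
    # grab the page and collect every distinct .jpg URL on it
    html = urllib2.urlopen(page_url).read()
    image_urls = set(JPG_RE.findall(html))

    already_have = set(os.listdir(dest_dir))   # files from earlier runs
    for image_url in image_urls:
        file_name = os.path.basename(image_url)
        if file_name in already_have:
            continue    # skip files downloaded on an earlier run
        urllib.urlretrieve(image_url, os.path.join(dest_dir, file_name))
        print 'Downloaded:', file_name

fetch_new_images('http://www.4chan.org/s')

It assumes the chan folder already exists, same as my script above.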