#!/usr/bin/env python
from optparse import OptionParser
from sys import stderr, argv, exit
from time import sleep
from threading import Thread
from Queue import Queue
from os import mkdir, path
import urllib2
import re
"""
This program automatically download images from
http://idol.sankakucomplex.com/
http://chan.sankakucomplex.com/
"""
THREAD_COUNT = 5
DEFAULT_LIMIT = 1000
SEARCH_URL =
"http://idol.sankakucomplex.com/post/index.content?page=%d&tags=%s"
class Downloader(Thread):
    """Worker thread: downloads image URLs taken from the shared queue."""

    def __init__(self, queue, directory):
        Thread.__init__(self)
        self.queue = queue
        self.directory = directory

    def run(self):
        sleeptime = 1
        while True:
            url = self.queue.get()
            try:
                # the basename of the URL becomes the local file name
                filename = path.join(self.directory, re.sub(r'.*/', '', url))
                # we don't have to download a file we already have
                if not path.isfile(filename):
                    content = urllib2.urlopen(url).read()
                    fp = open(filename, 'wb')
                    fp.write(content)
                    fp.close()
                    print "Downloaded: %s" % filename
                else:
                    print "Already exists: %s" % filename
                self.queue.task_done()
                sleeptime = 1
            except urllib2.HTTPError, err:
                # HTTPError carries an HTTP status code; a plain URLError
                # (handled below) does not
                print >> stderr, "Error %d: %s" % (err.code, err.msg)
                self.queue.task_done()
                print "Sleep %d seconds" % sleeptime
                sleep(sleeptime)
                sleeptime += 1
                # retry on 5xx (server-side) errors
                if err.code / 100 == 5:
                    self.queue.put(url)
            except urllib2.URLError, err:
                print >> stderr, "Error: %s" % err.reason
                self.queue.task_done()
                print "Sleep %d seconds" % sleeptime
                sleep(sleeptime)
                sleeptime += 1
def parse_options():
    parser = OptionParser(usage="Usage: %prog [options] directory")
    parser.add_option(
        "-t", "--tag", dest="tags",
        action="append", metavar="TAG",
        default=[],
        help="tag to search images for (may be given multiple times)"
    )
    parser.add_option(
        "-n", "--limit", dest="limit",
        type="int",
        default=DEFAULT_LIMIT,
        help="maximum number of images to download"
    )
    options, args = parser.parse_args()
    # the target directory is a mandatory positional argument
    if len(args) < 1:
        parser.print_help()
        exit(-1)
    directory = args[0]
    # tags are joined with '+', the separator the site expects in URLs
    tags = '+'.join(options.tags)
    limit = options.limit
    return directory, tags, limit
def check_dir(directory):
    if not path.isdir(directory):
        if path.isfile(directory):
            return False
        else:
            mkdir(directory)
            return True
    else:
        return True
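
# The main thread crawls the search result pages and feeds image URLs into
# the queue; THREAD_COUNT Downloader threads consume and fetch them in
# parallel.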
def main():
    queue = Queue()
    (directory, tags, limit) = parse_options()
    if directory is None:
        print >> stderr, \
            "Usage: %s [-t|--tag tag] [-n|--limit number] directory" % argv[0]
        exit(-1)
    if not check_dir(directory):
        print >> stderr, "mkdir failed"
        exit(-1)
    for i in range(THREAD_COUNT):
        t = Downloader(queue, directory)
        t.setDaemon(True)
        t.start()
    # pretend to be a regular browser so the site serves the pages
    opener = urllib2.build_opener()
    opener.addheaders = [
        ('User-Agent',
         'Mozilla/5.0 (X11; Linux x86_64) '
         'AppleWebKit/537.31 (KHTML, like Gecko) '
         'Chrome/26.0.1410.63 Safari/537.31')
    ]
    count = 0
    page = 1
    sleeptime = 1
    while count < limit:
        try:
            url = SEARCH_URL % (page, tags)
            document = opener.open(url).read()
            # NOTE: the original regex was lost; this assumed pattern
            # captures the src attribute of the thumbnail <img> tags
            lst = re.findall(
                r'<img[^>]*src="([^"]+)"', document)
            for tag in lst:
                if count >= limit:
                    break
                # rewrite the preview URL on image server "iN" to the
                # full-size URL on server "is", dropping "/preview"
                tag = re.sub(
                    r'^(https?://i)[0-9](.*)/preview(.*)$',
                    r'\1s\2\3', tag)
                queue.put(tag)
                count += 1
            if len(lst) == 0:
                print "No more images"
                break
            print "Found %d images on page %d" % (len(lst), page)
            page += 1
            sleeptime = 1
        except urllib2.HTTPError, err:
            # HTTPError carries a status code; a plain URLError does not
            print >> stderr, "Error %d: %s" % (err.code, err.msg)
            print "Sleep %d seconds" % sleeptime
            sleep(sleeptime)
            sleeptime += 1
        except urllib2.URLError, err:
            print >> stderr, "Error: %s" % err.reason
            print "Sleep %d seconds" % sleeptime
            sleep(sleeptime)
            sleeptime += 1
    queue.join()
if __name__ == "__main__":
    main()
# vim:set softtabstop=4 shiftwidth=4 expandtab: