sankaku crawler

#!/usr/bin/env python
from optparse import OptionParser
from sys import stderr, argv, exit
from time import sleep

from threading import Thread
from Queue import Queue

from os import mkdir, path

import urllib2
import re

"""
This program automatically downloads images from
http://idol.sankakucomplex.com/
http://chan.sankakucomplex.com/
"""

# Number of Downloader worker threads started by main().
THREAD_COUNT = 5
# Default for -n/--limit: maximum number of images queued for download.
DEFAULT_LIMIT = 1000
# Search page URL template, filled with (page number, tag query string).
# Parenthesised so the long literal can sit on its own line.
SEARCH_URL = (
    "http://idol.sankakucomplex.com/post/index.content?page=%d&tags=%s"
)

class Downloader(Thread):
    """Worker thread that saves queued image URLs into a directory.

    The worker loops forever pulling URLs from the shared queue, so it is
    meant to be started as a daemon thread; the producer waits for the work
    to finish with ``queue.join()``.
    """

    def __init__(self, queue, directory):
        """Store the shared work queue and the download directory.

        queue     -- queue of image URLs shared with the producer
        directory -- directory the downloaded files are written into
        """
        Thread.__init__(self)
        self.queue = queue
        self.directory = directory

    def _fetch(self, url):
        """Download ``url`` into the directory unless the file exists."""
        # The local file name is everything after the last '/' of the URL.
        filename = path.join(self.directory, re.sub(r'.*/', '', url))
        #We don't have to download when we already have it
        if path.isfile(filename):
            print("Already exist: %s" % filename)
            return

        response = urllib2.urlopen(url)
        try:
            content = response.read()
        finally:
            response.close()
        with open(filename, 'wb') as fp:
            fp.write(content)
        print("Downloaded: %s" % (filename))

    def run(self):
        """Process URLs from the queue until the process exits.

        After a URL error the worker sleeps (one second longer after each
        consecutive failure) and re-queues the URL only when the server
        answered with a 5xx status.
        """
        sleeptime = 1
        while True:
            url = self.queue.get()
            try:
                self._fetch(url)
                sleeptime = 1
            except urllib2.URLError as err:
                # Only HTTPError carries a status code; a plain URLError
                # (DNS failure, refused connection, ...) has just a reason.
                code = getattr(err, 'code', None)
                if code is None:
                    stderr.write("Error: %s\n" % (err.reason,))
                else:
                    stderr.write("Error %d: %s\n" % (code, err.reason))

                print("Sleep %d seconds" % (sleeptime))
                sleep(sleeptime)
                sleeptime += 1
                #retry when 5xx error
                if code is not None and 500 <= code < 600:
                    # Re-queue before task_done() (in the finally clause) so
                    # the unfinished-task count never drops to zero while a
                    # retry is still pending; otherwise queue.join() could
                    # return early and the retry would be lost.
                    self.queue.put(url)
            except EnvironmentError as err:
                # Local I/O or socket failure: report it and move on
                # instead of letting the worker thread die.
                stderr.write("Error: %s\n" % (err,))
            finally:
                # Always balance the get() above, whatever happened.
                self.queue.task_done()

def parse_options(args=None):
    """Parse the command line into ``(directory, tags, limit)``.

    args -- list of arguments to parse; ``None`` (the default) makes
            optparse read ``sys.argv[1:]``

    Returns a tuple of:
      directory -- the first positional argument
      tags      -- every -t/--tag value joined by single spaces
                   (an empty string when no tag was given)
      limit     -- the -n/--limit value (DEFAULT_LIMIT when not given)

    Prints the help text and exits with status -1 when the directory
    argument is missing.
    """
    # The directory is a mandatory positional argument, so name it in the
    # usage line.
    parser = OptionParser(usage="Usage: %prog [options] directory")
    parser.add_option(
        "-t", "--tag", dest="tags",
        action="append", metavar="TAG",
        default=[],
        help="tag for find images"
    )
    parser.add_option(
        "-n", "--limit", dest="limit",
        type="int",
        default=DEFAULT_LIMIT,
        help="limit max count for downloading image"
    )

    options, positional = parser.parse_args(args)

    #directory is mandatory option
    if not positional:
        parser.print_help()
        exit(-1)

    return positional[0], ' '.join(options.tags), options.limit

def check_dir(directory):
    """Make sure ``directory`` exists as a directory, creating it if needed.

    Returns True when the directory already exists or was created, and
    False when the path cannot be used: it is a regular file, or creating
    the directory failed.
    """
    if path.isdir(directory):
        return True
    if path.isfile(directory):
        return False
    try:
        mkdir(directory)
    except OSError:
        # Creation failed (missing parent, no permission, ...). If someone
        # else created the directory in the meantime it is still usable.
        return path.isdir(directory)
    return True

def _full_size_url(preview_url):
    """Rewrite a preview (thumbnail) URL into the full-size image URL."""
    # "http://i<digit>HOST/.../preview/REST" -> "http://isHOST/.../REST"
    return re.sub(
        r'^(https?://i)[0-9](.*)/preview(.*)$',
        r'\1s\2\3', preview_url)


def main():
    """Crawl the search result pages and queue the images for download.

    Reads the command line with parse_options(), starts THREAD_COUNT
    daemon Downloader threads, then walks the search pages one by one,
    queueing up to ``limit`` image URLs. Stops when a page yields no images
    or the limit is reached, then waits until the queue has been processed.
    """
    # Python 2 stdlib; imported locally so this function carries its own
    # dependency.
    from urllib import quote_plus

    queue = Queue()
    (directory, tags, limit) = parse_options()

    if directory is None:
        stderr.write(
            "Usage: %s [-t|--tag tag] [-n|--limit number] directory\n"
            % (argv[0]))
        exit(-1)

    if not check_dir(directory):
        stderr.write("mkdir failed\n")
        exit(-1)

    for _ in range(THREAD_COUNT):
        worker = Downloader(queue, directory)
        # Daemon threads let the process exit although run() never returns.
        worker.setDaemon(True)
        worker.start()

    opener = urllib2.build_opener()
    opener.addheaders = [
        ('User-Agent',
         'Mozilla/5.0 (X11; Linux x86_64) '
         'AppleWebKit/537.31 (KHTML, like Gecko) '
         'Chrome/26.0.1410.63 Safari/537.31')
    ]

    # NOTE(review): the original search pattern was lost (it was an empty
    # string, which matches at every position and yields only empty
    # strings). This reconstruction captures the src of <img> tags whose
    # URL contains "/preview/" -- confirm against the site's real markup.
    preview_pattern = re.compile(
        r'<img\b[^>]*?\bsrc="(https?://[^"]+/preview/[^"]+)"')

    # Tags are joined with spaces; encode them so the URL stays valid.
    query = quote_plus(tags)

    count = 0
    page = 1
    sleeptime = 1
    while count < limit:
        try:
            response = opener.open(SEARCH_URL % (page, query))
            try:
                document = response.read()
            finally:
                response.close()
        except urllib2.URLError as err:
            # Only HTTPError carries a status code; a plain URLError has
            # just a reason.
            code = getattr(err, 'code', None)
            if code is None:
                stderr.write("Error: %s\n" % (err.reason,))
            else:
                stderr.write("Error %d: %s\n" % (code, err.reason))
            # Back off one second longer after each consecutive failure,
            # then retry the same page.
            print("Sleep %d seconds" % (sleeptime))
            sleep(sleeptime)
            sleeptime += 1
            continue

        sleeptime = 1
        previews = preview_pattern.findall(document)
        if not previews:
            print("No more images")
            break

        for preview in previews:
            if count >= limit:
                break
            queue.put(_full_size_url(preview))
            count += 1

        print("Found %d image on page %d" % (len(previews), page))
        page += 1

    # Block until the workers have processed everything that was queued.
    queue.join()

# Run the crawler only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
    # vim:set softtabstop=4 shiftwidth=4 expandtab:

kjwon15

I'm a hacker, I want to improve life.

Leave a Reply

Your email address will not be published. Required fields are marked *

 

This site uses Akismet to reduce spam. Learn how your comment data is processed.