Threaded Python Imgur Album Downloader

A little while ago, I stumbled across a neat Python script for downloading full albums from imgur.com. It did the job, but I couldn’t get over the fact that it was rather slow. I opened up the source and decided to hack on it a bit.

This is what I came up with. I feel bad about not being able to credit the original author for the parser; they neglected to put their name on their work!

# Imgur Album Downloader
#
# Now with multithreading!
# Just paste the URL of an Imgur album/gallery/subdomain
# at the prompt, and away you go!
#
# Improved by Duane Sibilly <duane@sibilly.com>
# Original Author unknown. (Sorry!)
# CPU detection function courtesy of phihag <phihag@phihag.de>

import sys
import os
import re
import time
import urllib
import threading
import subprocess
import datetime
import Queue
import xml.dom.minidom as minidom

class ImgurAlbum(object):
    """ Named collection of imgur image URLs waiting to be downloaded """

    def __init__(self, name, imageList):
        # A nameless album has no folder to live in, so ask the user for one.
        if name == '':
            print("No album name detected; please provide one!")
            name = raw_input('Album Name: ')

        self.name = name
        self.imageList = imageList

    def enqueueImages(self, queue):
        """ Creates the album folder, then feeds every image to the queue """

        # The folder has to be in place before any DownloadThread
        # tries to write into it.
        targetDir = os.path.dirname('imgur/%s/' % self.name.replace(' ', ''))
        if not os.path.exists(targetDir):
            os.makedirs(targetDir)

        # Hand the images to the dispatch queue in their original order.
        for imageURL in self.imageList:
            queue.put(imageURL)

class DownloadThread(threading.Thread):
    """ Threaded image downloader

    Worker that takes image URLs off a shared queue forever and saves each
    one as imgur/<album name without spaces>/<last URL path segment>.
    """

    def __init__(self, albumName, queue):
        threading.Thread.__init__(self)
        self.queue = queue
        self.albumName = albumName

    def run(self):
        """ Processes queue items until the program exits """
        while True:
            image = self.queue.get()
            try:
                self._download(image)
            finally:
                # Always signal the dispatch queue, otherwise queue.join()
                # in the main thread would block forever.
                self.queue.task_done()

    def _download(self, image):
        """ Fetches one image URL and stores it in the album folder """
        origName = image[image.rfind('/') + 1:]
        fileName = 'imgur/' + self.albumName.replace(' ', '') + '/' + origName

        if os.path.exists(fileName):
            # File already exists; do not overwrite
            print("File %s exists!" % fileName)
            return

        try:
            # Download first and only then create the local file, so a
            # failed download cannot leave an empty file behind that later
            # runs would skip as "already exists".
            remote = urllib.urlopen(image)
            try:
                imgData = remote.read()
            finally:
                remote.close()

            output = open(fileName, 'wb')
            try:
                output.write(imgData)
            finally:
                output.close()
        except Exception:
            # Worker boundary: report the failure and keep the thread alive
            # for the remaining images.
            if os.path.exists(fileName):
                try:
                    # Drop the partially written file so a rerun retries it.
                    os.remove(fileName)
                except OSError:
                    pass
            print("File read error!")
            return

        # display a nice progress dot (without print's newline)
        sys.stdout.write('.')
        sys.stdout.flush()

class PageParser(object):
    """ Imgur gallery page parser

    Scrapes an imgur subdomain, album or gallery page. While parsing,
    imageList holds the album name in slot 0 followed by the image URLs.
    """

    def __init__(self, url):
        self.url = url
        self.imageList = []

    def parse(self):
        """ Parses self.url and returns the result as an ImgurAlbum.

        Raises IndexError when nothing could be scraped, i.e. the page
        carries none of the subdomain/album/gallery markers.
        """
        self._parse(self.url)
        if not self.imageList:
            raise IndexError("No imgur album, gallery or subdomain found at %s" % self.url)
        return ImgurAlbum(self.imageList[0], self.imageList[1:])

    def _fetch(self, url):
        """ Downloads url and returns its body, closing the connection """
        response = urllib.urlopen(url)
        try:
            return response.read()
        finally:
            response.close()

    def _collect(self, results):
        """ Adds one parsed [name, image, image, ...] list to imageList """
        if self.imageList:
            # Only the first album names the download. The names of later
            # albums (subdomain pages) must not end up among the image URLs.
            results = results[1:]
        self.imageList.extend(results)

    def _parse(self, url):
        """ Detects the page type and dispatches to the matching parser """
        page = self._fetch(url)

        # The already downloaded page is handed on so that no URL has to
        # be fetched twice.
        if 'subdomain_css' in page:
            for linkURL in self._parseSubdomain(url, page):
                self._parse(linkURL)

        elif 'album_css' in page:
            self._collect(self._parseAlbum(url, page))

        elif 'gallery_css' in page:
            self._collect(self._parseGallery(url, page))

    def _parseSubdomain(self, url, page=None):
        """ Returns the ".../all" URL of every album cover on a subdomain.

        page is the already downloaded body of url; it is fetched when
        omitted.
        """
        if page is None:
            page = self._fetch(url)

        links = []
        tag = '"cover"'

        last = page.find(tag)
        while last != -1:
            # The link sits between 'href="' and the next '">' after the tag.
            hrefStart = page.find('href=', last) + 6
            hrefEnd = page.find('">', last + 9)
            links.append("http:" + page[hrefStart:hrefEnd] + "/all")

            last = page.find(tag, last + 9)

        return links

    def _parseAlbum(self, url, page=None):
        """ Returns [album title, image URL, image URL, ...] for an album.

        page is the already downloaded body of url; it is fetched when
        omitted.
        """
        if page is None:
            page = self._fetch(url)

        # The title is the quoted value following 'data-title="'.
        titleStart = page.find("data-title") + 12
        albumimages = [page[titleStart:page.find('"', titleStart)]]

        # The image data is the literal following "images: " up to and
        # including the first "]}".
        start = page.find("images:", page.find("ImgurAlbum")) + 8
        rawAlbumdata = page[start:page.find("]}", start) + 2]

        # SECURITY NOTE(review): eval() executes text taken from a remote
        # page. Kept because the exact format of the data is not visible
        # here; json.loads would be the safe choice if it is strict JSON.
        # The names below let eval() resolve JSON-style literals.
        null = False
        true = True
        false = False
        albumdata = eval(rawAlbumdata)

        for item in albumdata["items"]:
            albumimages.append("http://i.imgur.com/" + item["hash"] + item["ext"])

        return albumimages

    def _parseGallery(self, url, page=None):
        """ Returns [gallery name, image URL, image URL, ...] for a gallery.

        page is the already downloaded body of url; it is fetched when
        omitted. One XML listing is downloaded per gallery page.
        """
        if page is None:
            gallery = self._fetch(url)
        else:
            gallery = page

        maxpage = gallery.find("maxPage:")
        pagecount = gallery[maxpage + 8:gallery.find(",", maxpage)].replace(' ', '')

        baseUrl = gallery.find("baseURL:")
        basePath = gallery[baseUrl + 8:gallery.find(",", baseUrl)].replace(' ', '').replace("'", '')
        url = "http://www.imgur.com" + basePath
        galleryname = basePath.replace('/', '')
        galleryimages = [galleryname]

        # int() instead of eval(): the page count comes from the remote
        # page and must only ever be a number.
        for pageNumber in range(int(pagecount)):
            if url[-1:] == "/":
                xmlurl = url + "hot/page/" + str(pageNumber) + ".xml"
            else:
                xmlurl = url + "/hot/page/" + str(pageNumber) + ".xml"

            xml = self._fetch(xmlurl)

            print("Page %s" % pageNumber)

            last = 0
            while True:
                hashPos = xml.find("<hash>", last)
                if hashPos == -1:
                    break

                link = xml[hashPos + 6:xml.find("</", hashPos)]

                # The extension is the first <ext> element after the hash.
                extPos = xml.find("<ext>", hashPos)
                ext = xml[extPos + 5:xml.find("</", extPos)]

                galleryimages.append("http://i.imgur.com/" + link + ext)

                last = hashPos + 1

        return galleryimages

def numberOfCPUs():
    """ Determines the number of virtual or physical CPUs on this system.
        Function courtesy of phihag <phihag@phihag.de>
        See: http://stackoverflow.com/questions/1006289/how-to-find-out-the-number-of-cpus-in-python

        The detection strategies below are tried in order and the first
        one that produces a positive count wins. Returns 1 when none of
        them succeeds.
    """

    # Python 2.6+
    try:
        import multiprocessing
        return multiprocessing.cpu_count()
    except (ImportError, NotImplementedError):
        pass

    # POSIX
    try:
        res = int(os.sysconf('SC_NPROCESSORS_ONLN'))

        if res > 0:
            return res
    except (AttributeError, ValueError):
        pass

    # Windows (eww...)
    try:
        res = int(os.environ['NUMBER_OF_PROCESSORS'])
        if res > 0:
            return res
    except (KeyError, ValueError):
        pass

    # Jython
    try:
        from java.lang import Runtime
        runtime = Runtime.getRuntime()
        res = runtime.availableProcessors()
        if res > 0:
            return res
    except ImportError:
        pass

    # BSD
    try:
        # The sysctl key is "hw.ncpu"; "hw.cpu" does not exist, so this
        # branch could never succeed before.
        sysctl = subprocess.Popen(['sysctl', '-n', 'hw.ncpu'], stdout=subprocess.PIPE)
        scStdout = sysctl.communicate()[0]
        res = int(scStdout)

        if res > 0:
            return res
    except (OSError, ValueError):
        pass

    # Linux
    try:
        cpuinfo = open('/proc/cpuinfo')
        try:
            res = cpuinfo.read().count('processor\t:')
        finally:
            cpuinfo.close()

        if res > 0:
            return res
    except IOError:
        pass

    # Solaris
    try:
        pseudoDevices = os.listdir('/devices/pseudo/')
        expr = re.compile('^cpuid@[0-9]+$')
        res = 0
        for device in pseudoDevices:
            if expr.match(device) is not None:
                res += 1

        if res > 0:
            return res
    except OSError:
        pass

    # Other Unices (heuristic)
    try:
        try:
            bootLog = open('/var/run/dmesg.boot')
            try:
                dmesg = bootLog.read()
            finally:
                bootLog.close()
        except IOError:
            dmesgProcess = subprocess.Popen(['dmesg'], stdout=subprocess.PIPE)
            dmesg = dmesgProcess.communicate()[0]

        # Count consecutive "cpuN:" lines starting at cpu0
        res = 0
        while '\ncpu' + str(res) + ':' in dmesg:
            res += 1

        if res > 0:
            return res
    except OSError:
        pass

    # If we can't determine the number of CPUs, default to one
    return 1

def main():
    """ Prompts for an imgur URL and downloads everything found there """

    # Ask for the page and scrape it into an ImgurAlbum object
    url = raw_input('imgur.com gallery URL: ')
    album = PageParser(url).parse()
    imageCount = len(album.imageList)

    # Never start more workers than there are images or CPUs
    threads = min(imageCount, numberOfCPUs())

    start = time.time()
    print("Fetching '%s' (%d images)" % (album.name, imageCount))
    print("Downloading with %d threads..." % threads)

    # Workers are daemons so they die with the main thread once
    # the queue has been drained.
    queue = Queue.Queue()
    for _ in range(threads):
        worker = DownloadThread(album.name, queue)
        worker.setDaemon(True)
        worker.start()

    # Filling the queue is what actually kicks off the downloads...
    album.enqueueImages(queue)

    # ...and join() returns once every image has been marked as done.
    queue.join()
    print("\n")
    print("DONE! Elapsed time: %.2f seconds" % (time.time() - start))

# Start the downloader only when this file is run as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Posted in development | Tagged , , |

bcrypt_objc: Command Line Utilities in Objective-C

About a week ago, I advised Rogelio Gundino on Twitter that it was not only possible, but pretty fun to write command-line apps on OS X in Objective-C. I promised him that I’d make a blog post about it, and started drafting examples. The problem with sample code, of course, is that it’s sample code. There is a world of difference between:

  1. code written for an API presentation or tech demo
  2. code written to actually perform a useful task

Since I like to read (and write) the latter, I threw out three attempts at useful demonstration before I stumbled across Marco Arment’s clean-room implementation of the bcrypt cryptographic hash algorithm for PHP 5.3. Cryptography gets any developer nerd’s juices flowing, and I began to consider what it would take to port bcrypt to Objective-C. Continue reading

Posted in development | Tagged , , , , , , , |

Left is Right

In a 2005 Pimp My Code post, Wil Shipley challenged the Objective-C idiom for class instantiation. The traditional method is something like this:

// Traditional initializer idiom: the success path is nested inside the `if`.
- (id)init
{
  // Assign the superclass initializer's result to self; the body below
  // only runs when that result is non-nil.
  if ((self = [super init])) {
    // Success!
    // Initialize your ivars, etc.
  }
  // self is nil here when [super init] returned nil.
  return self;
}

For a number of reasons, Wil pushed a different idiom:

// Early-return initializer idiom: the failure case exits first, so the
// success path stays at the left margin.
- (id)init
{
  // Bail out immediately when the superclass initializer returns nil.
  if (! (self = [super init]))
    return nil;

  // Success!
  // Initialize your ivars, etc.

  return self;
}

This felt strange to me until I realized that it looks an awful lot like other conditional blocks of code I’ve written. Take this snippet from my MailQueue project, for example:

/**
 * Appends a recipient to the internal list after two guard checks.
 *
 * @param mixed $mailRecipient Candidate recipient; rejected unless it is a MailRecipient.
 * @return bool TRUE when the recipient was appended, FALSE when a guard rejected it.
 */
public function addMailRecipient($mailRecipient)
{
  // Guard 1: only MailRecipient objects are accepted.
  if (! is_a($mailRecipient, 'MailRecipient'))
    // Failure: Improper type
    return FALSE;

  // Guard 2: skip recipients whose address addressExists() reports as known.
  if ($this->addressExists($mailRecipient->address()))
    // Failure: email address is already on the recipient list
    return FALSE;

  // Success: Add recipient to recipient list
  $this->_list[] = $mailRecipient;
  return TRUE;
}

What this function and Wil’s constructor idiom have in common is that the normal flow of control follows the left margin of the code. This is one of those little things that makes a HUGE difference when you’re writing a project of any non-trivial size (or when you’re working on multiple projects at once in multiple languages.) Try to write your code so that your conditionals test for failures instead of successes. This enables anyone who reads your code (including you!) to more easily follow along.

Posted in development | Tagged , , , |

Unique Build Numbers for XCode 4

With XCode 4’s integration of Git for version control, I found an old post from Marcus Zarra and Matt Long of Cocoa Is My Girlfriend to be very useful. Back in 2008 they built a handy build number script. It builds on Daniel Jalkut’s Perl script, originally written for Subversion, that injects a unique identifier from the latest commit into an application’s Info.plist file on every build. Such unique identifiers are very useful for developers in distinguishing larger public revisions (e.g. version 1.5) from smaller internal revisions (e.g. build 5369f78).

How to arrange your new build phase.

How to arrange your new build phase. (Click to enlarge.)


I’ve written a Python script that does much the same thing for XCode 4. To use this script, create a new “Shell Script Phase” at the end of your target’s build phase chain, just after “Copy Bundle Resources.” Paste the contents of the script into that phase, and be sure to specify “/usr/bin/python” as the shell for the script. The script will edit the Info.plist file of your fresh app build to reflect the latest git revision.

# XCode 4 auto-versioning script for Git
# Inspired by the work of Axel Andersson, Marcus S. Zarra and Matt Long
# http://valthonis.net/u/19

"""
NOTE: Due to its use of build environment variables, this
script will only work from inside XCode's build process!
"""

import os
import csv
from subprocess import Popen, PIPE
from Foundation import NSMutableDictionary

# Ask git for the short hash of the latest commit. The command is passed as
# an argument list (no shell needed), communicate() waits for git to finish,
# and strip() removes the trailing newline git prints so it cannot end up
# inside the version string.
cmd = ["/usr/local/bin/git", "rev-parse", "--short", "HEAD"]
build_number = Popen(cmd, stdout=PIPE).communicate()[0].strip()

# Both variables are set by XCode's build process.
info_plist = os.path.join(os.environ['BUILT_PRODUCTS_DIR'], os.environ['WRAPPER_NAME'], "Info.plist")

# Open the plist and write "<core version> build <short commit hash>" as the
# bundle version.
plist = NSMutableDictionary.dictionaryWithContentsOfFile_(info_plist)

# Keep only the text before the first space so that a " build <hash>" suffix
# written by an earlier run is replaced instead of appended to.
core_version = next(csv.reader([plist['CFBundleVersion'].rstrip()], delimiter=" "))[0]
full_version = ''.join([core_version, ' build ', build_number])
plist['CFBundleVersion'] = full_version
plist.writeToFile_atomically_(info_plist, 1)

EDIT: Changed script to append build number to project version. The plist should now use the form “1.5 build 5369f78”.

EDIT: Fixed a bug where successive builds might repeatedly append build numbers to the core version (resulting in version strings like “1.6 build 5a32f28 build 03b295e” and so on.) As such, the script now requires Python’s built-in csv module. Also, this script is now available via gist.github.

Posted in development | Tagged , , , , , , , |

Unplayed

Shaun Inman devised a rather easy and ingenious way to keep track of one’s various video games.

I do this for a couple reasons:

  • To keep track of games that I’ve heard great things about but never played. There was a period during college and a couple years after where I didn’t own any consoles. I missed out on some great titles and I’m still catching up. The Unplayed.
  • To keep track of games that I’ve started (and may have been distracted from finishing). I’m playing more games lately as research and sometimes stumble across a game so good I forget about the game I was playing for pleasure. The Unbeaten.
  • To keep track of games I’ve put down and why. The Beaten and The Abandoned. I’m human, I like specific genres (eg. jRPGs and metroidvanias) and repeating my mistakes (eg. any Dragon Quest game without Rocket Slime in the title).

You can grab Shaun’s Markdown-powered Unplayed script for use on your own PHP-enabled web host.  I’ve rolled out my own list for your perusal.

I fully expect to be pilloried on Twitter for some of the titles in the Unbeaten list.

Posted in games | Tagged , , , |