## Simple *booru crawler
## Usage: path\to\script
## Search parameters are in the form Parameter name = Parameter value
## Separate search parameters with ;
## Save File To is the location you want to save to
## Location may be relative or absolute; it is converted to an absolute path and created if it does not exist
## Feature Wishlist:
## 1. Multi-threading [done]
## 2. Hash Caching to DB
## 3. Error detection on user input
## 4. Custom Filename Nomenclature
## 5. GUI?
## 6. MAX limit, search for all images of a given tag
## 7. Status printing while downloading
## 8. check for absolute path [done]
import traceback
import urllib
import urllib2
from json import JSONDecoder as Decoder
import os
import threading
from threading import Thread
import Queue
import sys
import hashlib
## JSON endpoint that is queried for posts
url = 'http://konachan.com/post/index.json'
## work queue of (file_url, file_path, md5) tuples waiting to be downloaded
queue = Queue.Queue()
input_parameters = {}
############ text input ####################################
## parameters = raw_input('Search Parameters:')
## folder_path = raw_input('Save File To:')
## add default path if path left blank: %user%/Downloads/%tags%
parameters = 'tags = mikagami_mamizu ; limit = 1'
## raw string so the backslashes in the Windows path are never read as escapes
folder_path = r'C:\Users\Cirno\Downloads\Danbooru Images'

## splits the parameters on ';', strips whitespace, assigns dict
for segment in parameters.split(';'):
    segment = segment.strip()
    if not segment:
        ## tolerate a trailing or doubled ';' instead of crashing on it
        continue
    ## partition splits on the first '=' only, so a value may itself contain '='
    key, separator, value = segment.partition('=')
    if not separator:
        raise ValueError('Search parameter is not in the form '
                         'name = value: ' + repr(segment))
    input_parameters[key.strip()] = value.strip()

## limit is optional; when given it must be a whole number
if 'limit' in input_parameters:
    input_parameters['limit'] = int(input_parameters['limit'])

## Normalizes folder path structure and converts it to absolute path if it is not
folder_path = os.path.abspath(os.path.normpath(folder_path))
## checks if path exists, if not, creates it
if not os.path.exists(folder_path):
    os.makedirs(folder_path)

## encodes data in url readable format, builds manual request
## (passing data to urllib2.Request makes it a POST request)
request_data = urllib.urlencode(input_parameters)
req = urllib2.Request(url, request_data)
## opens page, reads response and always closes the connection afterwards
response = urllib2.urlopen(req)
try:
    response_data = response.read()
finally:
    response.close()
## decodes JSON into the list of result dicts
query_results = Decoder().decode(response_data)
## helper for fingerprinting a file on disk; defined once, outside the
## results loop, so it exists even when the query returns nothing
def hash_sum(file_path):
    """Return the hexadecimal MD5 digest of the file at file_path.

    The file is opened in binary mode and read in 8192-byte chunks, so
    the whole file is never held in memory at once. Errors from opening
    or reading the file propagate to the caller.
    """
    file_hash_temp = hashlib.md5()
    with open(file_path, 'rb') as file_to_be_checked:
        ## read() returns an empty byte string at end of file
        for chunk in iter(lambda: file_to_be_checked.read(8192), b''):
            file_hash_temp.update(chunk)
    return file_hash_temp.hexdigest()


## takes each result, works out where the file should be saved and
## queues it for download unless it is already on disk
for result in query_results:
    md5 = result['md5']
    file_url = result['file_url']
    ## splitext keeps the dot and the full extension whatever its length
    ## (a fixed [-4:] slice turned '.jpeg' into 'jpeg')
    file_extension = os.path.splitext(file_url)[1]
    file_name = md5 + file_extension
    file_path = os.path.join(folder_path, file_name)
    ## checks if file exists, if it does skips it, otherwise puts a
    ## (file_url, file_path, md5) tuple on the queue
    if os.path.isfile(file_path):
        print('File Exists, Moving On!')
        continue
    queue.put((file_url, file_path, md5))
## function for retrieving data from server
## also checks and compares file md5sum after download completion
def fetch_url(url_md5_and_path_tuple):
    """Download one queued file and report whether its MD5 matches.

    url_md5_and_path_tuple -- a (file_url, file_path, md5) tuple, in that
        order: the address to download from, the path to save to, and
        the MD5 hex digest to compare the saved file against.

    Prints the verification result and returns nothing. Errors raised by
    the download or by hashing the saved file propagate to the caller.
    """
    file_url, file_path, md5 = url_md5_and_path_tuple
    urllib.urlretrieve(file_url, file_path)
    ## future note: also check hashes of files already on disk
    ## before downloading
    ## hash the saved file once and reuse the digest, instead of
    ## re-reading the whole file for every comparison and message
    on_disk_md5 = hash_sum(file_path)
    if on_disk_md5 == md5:
        print('File Exists, md5sum Verified!')
        print('MD5 SUMS MATCH! Resulting Sum: ' + on_disk_md5)
    else:
        print('MD5 SUM DOES NOT MATCH! ' + 'Site MD5: ' + md5
              + ' On Disk MD5: ' + on_disk_md5)
## class for passing queue to thread
class url_download(threading.Thread):
    """Worker thread that downloads queued items until the queue is empty.

    queue -- a Queue.Queue whose items are the tuples fetch_url expects.
    """

    def __init__(self, queue):
        self.queue = queue
        threading.Thread.__init__(self)

    def run(self):
        """Take items off the queue and fetch them, one at a time.

        The thread finishes as soon as the queue is found empty. A failure
        while fetching one item is printed to stderr and the worker moves
        on to the next item.
        """
        while True:
            ## only the dequeue can signal "no work left", so it gets its
            ## own try and a plain return ends the thread
            try:
                job = self.queue.get_nowait()
            except Queue.Empty:
                return
            ## Exception rather than a bare except, so SystemExit and
            ## KeyboardInterrupt are not swallowed
            try:
                fetch_url(job)
            except Exception:
                traceback.print_exc(file=sys.stderr)
                sys.stderr.flush()
## number of download threads to run at the same time
## NOTE(review): the original assignment had no value (a syntax error);
## 4 is a placeholder default -- confirm the intended number
num_conn = 4
threads = []
## start the workers; each one drains the shared queue until it is empty
for _ in range(num_conn):
    t = url_download(queue)
    t.start()
    threads.append(t)
## wait for every worker to finish before the script exits
for thread in threads:
    thread.join()