So I’ve begun writing the crawler that I’ve wanted to write for months now. Code below. Note: I haven’t tested this version of the code as I’m in the middle of making some modifications.
|
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 |
## Simple *booru crawler
## Usage: path\to\script
## Search parameters are in the form Parameter name = Parameter value
## Separate search parameters with ;
## Save File To is the folder you want to save to
## Relative paths are converted to absolute, and the folder is created if missing
## Feature Wishlist:
## 1. Multi-threading [done]
## 2. Hash Caching to DB
## 3. Error detection on user input
## 4. Custom Filename Nomenclature
## 5. GUI?
## 6. MAX limit, search for all images of a given tag
## 7. Status printing while downloading
## 8. check for absolute path [done]
import traceback
import urllib
import urllib2
from json import JSONDecoder as Decoder
import os
import threading
from threading import Thread
import Queue
import sys
import hashlib
## API endpoint that is queried for posts, and the shared queue that the
## download threads later drain
url = 'http://konachan.com/post/index.json'
queue = Queue.Queue()
input_parameters = {}
############ text input ####################################
## parameters = raw_input('Search Parameters:')
## folder_path = raw_input('Save File To:')
## add default path if path left blank: %user%/Downloads/%tags%
parameters = 'tags = mikagami_mamizu ; limit = 1'
## raw string so no backslash in the Windows path can be read as an escape
folder_path = r'C:\Users\Cirno\Downloads\Danbooru Images'
## splits the parameters on ';', strips whitespace, assigns dict
for pair in parameters.split(';'):
    pair = pair.strip()
    if not pair:
        ## tolerate empty segments such as a trailing ';'
        continue
    if '=' not in pair:
        raise ValueError('Search parameter is not in the form '
                         'name = value: %r' % pair)
    ## split on the first '=' only so the value itself may contain '='
    key, value = pair.split('=', 1)
    input_parameters[key.strip()] = value.strip()
## limit is sent as an integer when given; it is no longer required
if 'limit' in input_parameters:
    input_parameters['limit'] = int(input_parameters['limit'])
## Normalizes folder path structure and converts it to absolute path if it is not
folder_path = os.path.abspath(os.path.normpath(folder_path))
## checks if path exists, if not, creates it
if not os.path.exists(folder_path):
    os.makedirs(folder_path)
## encodes data in url readable format, builds manual request
## (supplying a data argument makes urllib2 send the request as a POST)
## opens page, reads response and decodes JSON
request_data = urllib.urlencode(input_parameters)
req = urllib2.Request(url, request_data)
response = urllib2.urlopen(req)
try:
    response_data = response.read()
finally:
    ## release the connection even if reading fails
    response.close()
query_results = Decoder().decode(response_data)
## Returns the hexadecimal MD5 digest of the file at file_path.
## The file is read in 8192-byte chunks so it is never held in memory whole.
## Defined once, ahead of the loop, rather than re-created for every result.
def hash_sum(file_path):
    file_hash_temp = hashlib.md5()
    with open(file_path, 'rb') as file_to_be_checked:
        ## b'' is the end-of-file sentinel for a file opened in binary mode
        for chunk in iter(lambda: file_to_be_checked.read(8192), b''):
            file_hash_temp.update(chunk)
    return file_hash_temp.hexdigest()


## takes each result and works out where its file should be saved
for result in query_results:
    md5 = result['md5']
    file_url = result['file_url']
    ## read but not used yet anywhere in this script
    file_tags = result['tags']
    ## splitext keeps the whole extension, dot included, whatever its length
    ## (a fixed [-4:] slice turned '.jpeg' into 'jpeg')
    file_extension = os.path.splitext(str(file_url))[1]
    file_name = md5 + file_extension
    file_path = os.path.join(folder_path, file_name)
    ## checks if file exists, if it does skips it, otherwise queues a
    ## (file_url, file_path, md5) tuple for the download threads
    if os.path.isfile(file_path):
        print('File Exists, Moving On!')
        continue
    queue.put((file_url, file_path, md5))
## function for retrieving data from server
## takes one (file_url, file_path, md5) tuple, downloads file_url to file_path,
## then compares the md5sum of the downloaded file with the md5 from the site
## and prints the outcome; returns nothing
def fetch_url(url_md5_and_path_tuple):
    file_url, file_path, md5 = url_md5_and_path_tuple
    urllib.urlretrieve(file_url, file_path)
    ## future note, make hashing of file separate function callable elsewhere
    ## so that I can check hashes before downloading
    ## hash the file once and reuse the digest instead of re-reading the file
    ## for every comparison and message; a missing file still raises here
    on_disk_md5 = hash_sum(file_path)
    if on_disk_md5 == md5:
        print('File Exists, md5sum Verified!')
        print('MD5 SUMS MATCH! Resulting Sum: ' + on_disk_md5)
    else:
        print('MD5 SUM DOES NOT MATCH! ' + 'Site MD5: ' + md5
              + ' On Disk MD5: ' + on_disk_md5)
## class for passing queue to thread
## each instance is a worker thread that takes entries off the queue it was
## given and hands them to fetch_url until the queue is empty
class url_download(threading.Thread):
    def __init__(self, queue):
        Thread.__init__(self)
        self.queue = queue

    def run(self):
        while True:
            ## only the queue read is guarded for Queue.Empty, so an empty
            ## queue simply ends this worker
            try:
                job = self.queue.get_nowait()
            except Queue.Empty:
                return
            ## a failed download is reported and the worker moves on to the
            ## next entry instead of dying
            try:
                fetch_url(job)
            except Exception:
                traceback.print_exc(file=sys.stderr)
                sys.stderr.flush()
## number of download threads to start
## NOTE(review): the original assignment had no value; 4 is a placeholder,
## adjust as needed
num_conn = 4
threads = []
for download in range(num_conn):
    t = url_download(queue)
    t.start()
    threads.append(t)
## wait for every worker to finish before the script ends
for thread in threads:
    thread.join()




0 Responses.