Undo my camelCase mistake

Ivan Kozik 2015-07-28 12:51:59 +00:00
parent 4eb2805df0
commit 4c84312462


@@ -9,33 +9,36 @@ from urllib.request import urlopen
 from autobahn.asyncio.websocket import WebSocketClientFactory, WebSocketClientProtocol
 from libgrabsite.ignoracle import Ignoracle, parameterize_record_info
 
-realStdoutWrite = sys.stdout.buffer.write
-realStderrWrite = sys.stderr.buffer.write
+real_stdout_write = sys.stdout.buffer.write
+real_stderr_write = sys.stderr.buffer.write
 
-def printToReal(s):
-	realStdoutWrite((s + "\n").encode("utf-8"))
+def print_to_real(s):
+	real_stdout_write((s + "\n").encode("utf-8"))
 	sys.stdout.buffer.flush()
 
 
 class GrabberClientProtocol(WebSocketClientProtocol):
-	def onOpen(self):
+	def on_open(self):
 		self.factory.client = self
-		self.sendMessage(json.dumps({
+		self.send_object({
 			"type": "hello",
 			"mode": "grabber",
-			"url": jobData["url"]
-		}).encode('utf-8'))
+			"url": job_data["url"]
+		})
 
-	def onClose(self, wasClean, code, reason):
+	def on_close(self, was_clean, code, reason):
 		self.factory.client = None
-		printToReal(
-			"Disconnected from ws:// server with (wasClean, code, reason): {!r}"
-			.format((wasClean, code, reason)))
-		asyncio.ensure_future(connectToServer())
+		print_to_real(
+			"Disconnected from ws:// server with (was_clean, code, reason): {!r}"
+			.format((was_clean, code, reason)))
+		asyncio.ensure_future(connect_to_server())
 
-	def sendObject(self, obj):
+	def send_object(self, obj):
 		self.sendMessage(json.dumps(obj).encode("utf-8"))
 
+	onOpen = on_open
+	onClose = on_close
+
 
 class GrabberClientFactory(WebSocketClientFactory):
 	protocol = GrabberClientProtocol
@@ -45,7 +48,7 @@ class GrabberClientFactory(WebSocketClientFactory):
 		self.client = None
 
-wsFactory = GrabberClientFactory()
+ws_factory = GrabberClientFactory()
 
 
 class Decayer(object):
 	def __init__(self, initial, multiplier, maximum):
@@ -71,51 +74,51 @@ class Decayer(object):
 @asyncio.coroutine
-def connectToServer():
+def connect_to_server():
 	host = os.environ.get('GRAB_SITE_WS_HOST', '127.0.0.1')
 	port = int(os.environ.get('GRAB_SITE_WS_PORT', 29001))
 	decayer = Decayer(0.25, 1.5, 8)
 	while True:
 		try:
-			coro = yield from loop.create_connection(wsFactory, host, port)
+			coro = yield from loop.create_connection(ws_factory, host, port)
 		except OSError:
 			delay = decayer.decay()
-			printToReal(
+			print_to_real(
 				"Could not connect to ws://{}:{}, retrying in {:.1f} seconds..."
 				.format(host, port, delay))
 			yield from asyncio.sleep(delay)
 		else:
-			printToReal("Connected to ws://{}:{}".format(host, port))
+			print_to_real("Connected to ws://{}:{}".format(host, port))
 			break
 
 loop = asyncio.get_event_loop()
-asyncio.ensure_future(connectToServer())
+asyncio.ensure_future(connect_to_server())
 
 
-def gracefulStopCallback():
-	printToReal("\n^C detected, creating 'stop' file, please wait for exit...")
-	with open(os.path.join(workingDir, "stop"), "wb") as f:
+def graceful_stop_callback():
+	print_to_real("\n^C detected, creating 'stop' file, please wait for exit...")
+	with open(os.path.join(working_dir, "stop"), "wb") as f:
 		pass
 
 
-def forcefulStopCallback():
+def forceful_stop_callback():
 	loop.stop()
 
-loop.add_signal_handler(signal.SIGINT, gracefulStopCallback)
-loop.add_signal_handler(signal.SIGTERM, forcefulStopCallback)
+loop.add_signal_handler(signal.SIGINT, graceful_stop_callback)
+loop.add_signal_handler(signal.SIGTERM, forceful_stop_callback)
 
 
-igsetCache = {}
-def getPatternsForIgnoreSet(name):
+igset_cache = {}
+def get_patterns_for_ignore_set(name):
 	assert name != "", name
-	if name in igsetCache:
-		return igsetCache[name]
-	printToReal("Fetching ArchiveBot/master/db/ignore_patterns/%s.json" % name)
-	igsetCache[name] = json.loads(urlopen(
+	if name in igset_cache:
+		return igset_cache[name]
+	print_to_real("Fetching ArchiveBot/master/db/ignore_patterns/%s.json" % name)
+	igset_cache[name] = json.loads(urlopen(
 		"https://raw.githubusercontent.com/ArchiveTeam/ArchiveBot/" +
 		"master/db/ignore_patterns/%s.json" % name).read().decode("utf-8")
 	)["patterns"]
-	return igsetCache[name]
+	return igset_cache[name]
 
-workingDir = os.environ['GRAB_SITE_WORKING_DIR']
+working_dir = os.environ['GRAB_SITE_WORKING_DIR']
 
 def mtime(f):
 	return os.stat(f).st_mtime
@@ -124,83 +127,83 @@ def mtime(f):
 
 
 class FileChangedWatcher(object):
 	def __init__(self, fname):
 		self.fname = fname
-		self.lastModificationTime = mtime(fname)
+		self.last_mtime = mtime(fname)
 
-	def hasChanged(self):
-		nowModificationTime = mtime(self.fname)
-		changed = mtime(self.fname) != self.lastModificationTime
-		self.lastModificationTime = nowModificationTime
+	def has_changed(self):
+		now_mtime = mtime(self.fname)
+		changed = mtime(self.fname) != self.last_mtime
+		self.last_mtime = now_mtime
 		return changed
 
 
-igsetsWatcher = FileChangedWatcher(os.path.join(workingDir, "igsets"))
-ignoresWatcher = FileChangedWatcher(os.path.join(workingDir, "ignores"))
+igsets_watcher = FileChangedWatcher(os.path.join(working_dir, "igsets"))
+ignores_watcher = FileChangedWatcher(os.path.join(working_dir, "ignores"))
 
 ignoracle = Ignoracle()
 
-def updateIgnoracle():
-	with open(os.path.join(workingDir, "igsets"), "r") as f:
+def update_ignoracle():
+	with open(os.path.join(working_dir, "igsets"), "r") as f:
 		igsets = f.read().strip("\r\n\t ,").split(',')
 
-	with open(os.path.join(workingDir, "ignores"), "r") as f:
+	with open(os.path.join(working_dir, "ignores"), "r") as f:
 		ignores = set(ig for ig in f.read().strip("\r\n").split('\n') if ig != "")
 
 	for igset in igsets:
-		patterns = getPatternsForIgnoreSet(igset)
+		patterns = get_patterns_for_ignore_set(igset)
 		if igset == "global":
 			patterns = filter(lambda p: "archive\\.org" not in p, patterns)
 		ignores.update(patterns)
 
-	printToReal("Using these %d ignores:" % len(ignores))
-	printToReal(pprint.pformat(ignores))
+	print_to_real("Using these %d ignores:" % len(ignores))
+	print_to_real(pprint.pformat(ignores))
 
 	ignoracle.set_patterns(ignores)
 
-updateIgnoracle()
+update_ignoracle()
 
 
-def shouldIgnoreURL(url, recordInfo):
+def should_ignore_url(url, record_info):
 	"""
 	Returns whether a URL should be ignored.
 	"""
-	parameters = parameterize_record_info(recordInfo)
+	parameters = parameterize_record_info(record_info)
 	return ignoracle.ignores(url, **parameters)
 
 
-def acceptURL(urlInfo, recordInfo, verdict, reasons):
-	if igsetsWatcher.hasChanged() or ignoresWatcher.hasChanged():
-		updateIgnoracle()
+def accept_url(url_info, record_info, verdict, reasons):
+	if igsets_watcher.has_changed() or ignores_watcher.has_changed():
+		update_ignoracle()
 
-	url = urlInfo['url']
+	url = url_info['url']
 	if url.startswith('data:'):
 		# data: URLs aren't something you can grab, so drop them to avoid ignore
 		# checking and ignore logging.
 		return False
 
-	pattern = shouldIgnoreURL(url, recordInfo)
+	pattern = should_ignore_url(url, record_info)
 	if pattern:
-		maybeLogIgnore(url, pattern)
+		maybe_log_ignore(url, pattern)
 		return False
 
 	# If we get here, none of our ignores apply.  Return the original verdict.
 	return verdict
 
-def queuedURL(urlInfo):
-	jobData["items_queued"] += 1
+def queued_url(url_info):
+	job_data["items_queued"] += 1
 
-def dequeuedURL(urlInfo, recordInfo):
-	jobData["items_downloaded"] += 1
+def dequeued_url(url_info, record_info):
+	job_data["items_downloaded"] += 1
 
 
-jobData = {
-	"ident": open(os.path.join(workingDir, "id")).read().strip(),
-	"url": open(os.path.join(workingDir, "start_url")).read().strip(),
-	"started_at": os.stat(os.path.join(workingDir, "start_url")).st_mtime,
+job_data = {
+	"ident": open(os.path.join(working_dir, "id")).read().strip(),
+	"url": open(os.path.join(working_dir, "start_url")).read().strip(),
+	"started_at": os.stat(os.path.join(working_dir, "start_url")).st_mtime,
 	"suppress_ignore_reports": True,
-	"concurrency": int(open(os.path.join(workingDir, "concurrency")).read().strip()),
+	"concurrency": int(open(os.path.join(working_dir, "concurrency")).read().strip()),
 	"bytes_downloaded": 0,
 	"items_queued": 0,
 	"items_downloaded": 0,
@@ -214,38 +217,38 @@ jobData = {
 	"runk": 0,
 }
 
 
-def handleResult(urlInfo, recordInfo, errorInfo={}, httpInfo={}):
-	#print("urlInfo", urlInfo)
-	#print("recordInfo", recordInfo)
-	#print("errorInfo", errorInfo)
-	#print("httpInfo", httpInfo)
+def handle_result(url_info, record_info, error_info={}, http_info={}):
+	#print("url_info", url_info)
+	#print("record_info", record_info)
+	#print("error_info", error_info)
+	#print("http_info", http_info)
 
-	updateIgoffInJobData()
+	update_igoff_in_job_data()
 	response_code = 0
-	if httpInfo.get("response_code"):
-		response_code = httpInfo.get("response_code")
-		response_code_str = str(httpInfo["response_code"])
+	if http_info.get("response_code"):
+		response_code = http_info.get("response_code")
+		response_code_str = str(http_info["response_code"])
 		if len(response_code_str) == 3 and response_code_str[0] in "12345":
-			jobData["r%sxx" % response_code_str[0]] += 1
+			job_data["r%sxx" % response_code_str[0]] += 1
 		else:
-			jobData["runk"] += 1
+			job_data["runk"] += 1
 
-	if httpInfo.get("body"):
-		jobData["bytes_downloaded"] += httpInfo["body"]["content_size"]
+	if http_info.get("body"):
+		job_data["bytes_downloaded"] += http_info["body"]["content_size"]
 
-	stop = shouldStop()
+	stop = should_stop()
 
-	response_message = httpInfo.get("response_message")
-	if errorInfo:
+	response_message = http_info.get("response_message")
+	if error_info:
 		response_code = 0
-		response_message = errorInfo["error"]
+		response_message = error_info["error"]
 
-	if wsFactory.client:
-		wsFactory.client.sendObject({
+	if ws_factory.client:
+		ws_factory.client.send_object({
 			"type": "download",
-			"job_data": jobData,
-			"url": urlInfo["url"],
+			"job_data": job_data,
+			"url": url_info["url"],
 			"response_code": response_code,
 			"response_message": response_message,
 		})
@@ -256,33 +259,33 @@ def handleResult(urlInfo, recordInfo, errorInfo={}, httpInfo={}):
 	return wpull_hook.actions.NORMAL
 
 
-def handleResponse(urlInfo, recordInfo, httpInfo):
-	return handleResult(urlInfo, recordInfo, httpInfo=httpInfo)
+def handle_response(url_info, record_info, http_info):
+	return handle_result(url_info, record_info, http_info=http_info)
 
 
-def handleError(urlInfo, recordInfo, errorInfo):
-	return handleResult(urlInfo, recordInfo, errorInfo=errorInfo)
+def handle_error(url_info, record_info, error_info):
+	return handle_result(url_info, record_info, error_info=error_info)
 
 
 # TODO: check only every 5 seconds max
-def shouldStop():
-	return os.path.exists(os.path.join(workingDir, "stop"))
+def should_stop():
+	return os.path.exists(os.path.join(working_dir, "stop"))
 
 
 # TODO: check only every 5 seconds max
-def updateIgoffInJobData():
-	igoff = os.path.exists(os.path.join(workingDir, "igoff"))
-	jobData["suppress_ignore_reports"] = igoff
+def update_igoff_in_job_data():
+	igoff = os.path.exists(os.path.join(working_dir, "igoff"))
+	job_data["suppress_ignore_reports"] = igoff
 	return igoff
 
 
-def maybeLogIgnore(url, pattern):
-	if not updateIgoffInJobData():
-		printToReal("IGNOR %s by %s" % (url, pattern))
-		if wsFactory.client:
-			wsFactory.client.sendObject({
+def maybe_log_ignore(url, pattern):
+	if not update_igoff_in_job_data():
+		print_to_real("IGNOR %s by %s" % (url, pattern))
+		if ws_factory.client:
+			ws_factory.client.send_object({
 				"type": "ignore",
-				"job_data": jobData,
+				"job_data": job_data,
 				"url": url,
 				"pattern": pattern
 			})
@@ -292,74 +295,75 @@ def maybeLogIgnore(url, pattern):
 ICY_FIELD_PATTERN = re.compile('icy-|ice-|x-audiocast-', re.IGNORECASE)
 ICY_VALUE_PATTERN = re.compile('icecast', re.IGNORECASE)
 
 
-def handlePreResponse(urlInfo, urlRecord, responseInfo):
-	url = urlInfo['url']
+def handle_pre_response(url_info, url_record, response_info):
+	url = url_info['url']
 
 	# Check if server version starts with ICY
-	if responseInfo.get('version', '') == 'ICY':
-		maybeLogIgnore(url, '[icy version]')
+	if response_info.get('version', '') == 'ICY':
+		maybe_log_ignore(url, '[icy version]')
 		return wpull_hook.actions.FINISH
 
 	# Loop through all the server headers for matches
-	for field, value in responseInfo.get('fields', []):
+	for field, value in response_info.get('fields', []):
 		if ICY_FIELD_PATTERN.match(field):
-			maybeLogIgnore(url, '[icy field]')
+			maybe_log_ignore(url, '[icy field]')
 			return wpull_hook.actions.FINISH
 
 		if field == 'Server' and ICY_VALUE_PATTERN.match(value):
-			maybeLogIgnore(url, '[icy server]')
+			maybe_log_ignore(url, '[icy server]')
 			return wpull_hook.actions.FINISH
 
 	# Nothing matched, allow download
-	printToReal(url + " ...")
+	print_to_real(url + " ...")
 	return wpull_hook.actions.NORMAL
 
 
-def stdoutWriteToBoth(message):
+def stdout_write_both(message):
 	assert isinstance(message, bytes), message
 	try:
-		realStdoutWrite(message)
-		if wsFactory.client:
-			wsFactory.client.sendObject({
+		real_stdout_write(message)
+		if ws_factory.client:
+			ws_factory.client.send_object({
 				"type": "stdout",
-				"job_data": jobData,
+				"job_data": job_data,
 				"message": message.decode("utf-8")
 			})
 	except Exception as e:
-		realStderrWrite((str(e) + "\n").encode("utf-8"))
+		real_stderr_write((str(e) + "\n").encode("utf-8"))
 
 
-def stderrWriteToBoth(message):
+def stderr_write_both(message):
 	assert isinstance(message, bytes), message
 	try:
-		realStderrWrite(message)
-		if wsFactory.client:
-			wsFactory.client.sendObject({
+		real_stderr_write(message)
+		if ws_factory.client:
+			ws_factory.client.send_object({
 				"type": "stderr",
-				"job_data": jobData,
+				"job_data": job_data,
 				"message": message.decode("utf-8")
 			})
 	except Exception as e:
-		realStderrWrite((str(e) + "\n").encode("utf-8"))
+		real_stderr_write((str(e) + "\n").encode("utf-8"))
 
 
-sys.stdout.buffer.write = stdoutWriteToBoth
-sys.stderr.buffer.write = stderrWriteToBoth
+sys.stdout.buffer.write = stdout_write_both
+sys.stderr.buffer.write = stderr_write_both
 
 
-def exitStatus(code):
+def exit_status(code):
 	print()
-	print("Finished grab {} {} with exit code {}".format(jobData["ident"], jobData["url"], code))
-	print("Output is in directory:\n{}".format(workingDir))
+	print("Finished grab {} {} with exit code {}".format(
+		job_data["ident"], job_data["url"], code))
+	print("Output is in directory:\n{}".format(working_dir))
 	return code
 
 
 assert 2 in wpull_hook.callbacks.AVAILABLE_VERSIONS
 wpull_hook.callbacks.version = 2
-wpull_hook.callbacks.accept_url = acceptURL
-wpull_hook.callbacks.queued_url = queuedURL
-wpull_hook.callbacks.dequeued_url = dequeuedURL
-wpull_hook.callbacks.handle_response = handleResponse
-wpull_hook.callbacks.handle_error = handleError
-wpull_hook.callbacks.handle_pre_response = handlePreResponse
-wpull_hook.callbacks.exit_status = exitStatus
+wpull_hook.callbacks.accept_url = accept_url
+wpull_hook.callbacks.queued_url = queued_url
+wpull_hook.callbacks.dequeued_url = dequeued_url
+wpull_hook.callbacks.handle_response = handle_response
+wpull_hook.callbacks.handle_error = handle_error
+wpull_hook.callbacks.handle_pre_response = handle_pre_response
+wpull_hook.callbacks.exit_status = exit_status
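
Note on the alias lines added in the first hunk: the rename stays compatible with Autobahn because `onOpen = on_open` and `onClose = on_close` re-bind the camelCase callback names that WebSocketClientProtocol actually invokes to the new snake_case implementations. A minimal sketch of this class-level alias pattern (the Framework/Handler names are illustrative, not from this codebase):

	class Framework:
		def run(self):
			self.onEvent()  # the framework looks up the camelCase name

	class Handler(Framework):
		def on_event(self):  # snake_case implementation
			print("handled")

		onEvent = on_event  # class-level alias keeps the framework's contract

	Handler().run()  # prints "handled"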
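Note on the reconnect loop: connect_to_server() retries with Decayer(0.25, 1.5, 8), asking decayer.decay() for a longer delay after each failed attempt. The Decayer body is elided from this diff; the following is only a plausible sketch consistent with how it is called here (geometric backoff from 0.25 s, multiplied by 1.5, capped at 8 s):

	class Decayer(object):
		# Plausible reconstruction; the real body is not shown in this diff.
		def __init__(self, initial, multiplier, maximum):
			self.initial = initial
			self.multiplier = multiplier
			self.maximum = maximum
			self.current = initial

		def decay(self):
			# Grow the delay geometrically, but never past the maximum.
			self.current = min(self.current * self.multiplier, self.maximum)
			return self.current

	d = Decayer(0.25, 1.5, 8)
	print([d.decay() for _ in range(4)])  # [0.375, 0.5625, 0.84375, 1.265625]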