Undo my camelCase mistake
This commit is contained in:
parent
4eb2805df0
commit
4c84312462
@ -9,33 +9,36 @@ from urllib.request import urlopen
|
||||
from autobahn.asyncio.websocket import WebSocketClientFactory, WebSocketClientProtocol
|
||||
from libgrabsite.ignoracle import Ignoracle, parameterize_record_info
|
||||
|
||||
# Keep references to the original binary writers; sys.stdout.buffer.write and
# sys.stderr.buffer.write are replaced with wrappers further down in this file.
real_stdout_write = sys.stdout.buffer.write
real_stderr_write = sys.stderr.buffer.write


def print_to_real(s):
    """Write the string *s* plus a newline to the saved stdout writer.

    The text is encoded as UTF-8 and the stdout buffer is flushed afterwards.
    """
    real_stdout_write((s + "\n").encode("utf-8"))
    sys.stdout.buffer.flush()
|
||||
|
||||
|
||||
class GrabberClientProtocol(WebSocketClientProtocol):
    """WebSocket client protocol that announces this grab and reconnects on close."""

    def on_open(self):
        """Register this protocol as the factory's client and send a "hello" message."""
        self.factory.client = self
        self.send_object({
            "type": "hello",
            "mode": "grabber",
            "url": job_data["url"],
        })

    def on_close(self, was_clean, code, reason):
        """Clear the factory's client, report the disconnect, and schedule a reconnect."""
        self.factory.client = None
        print_to_real(
            "Disconnected from ws:// server with (was_clean, code, reason): {!r}"
            .format((was_clean, code, reason)))
        asyncio.ensure_future(connect_to_server())

    def send_object(self, obj):
        """Serialize *obj* as JSON and send it as a UTF-8 encoded message."""
        self.sendMessage(json.dumps(obj).encode("utf-8"))

    # autobahn invokes the camelCase handler names, so expose the snake_case
    # implementations under those names as well.
    onOpen = on_open
    onClose = on_close
|
||||
|
||||
|
||||
class GrabberClientFactory(WebSocketClientFactory):
|
||||
protocol = GrabberClientProtocol
|
||||
@ -45,7 +48,7 @@ class GrabberClientFactory(WebSocketClientFactory):
|
||||
self.client = None
|
||||
|
||||
|
||||
# Single module-wide factory; its `client` attribute is checked before every
# send_object() call elsewhere in this file.
ws_factory = GrabberClientFactory()
|
||||
|
||||
class Decayer(object):
|
||||
def __init__(self, initial, multiplier, maximum):
|
||||
@ -71,51 +74,51 @@ class Decayer(object):
|
||||
|
||||
|
||||
# NOTE(review): asyncio.coroutine was removed in Python 3.11; kept here because
# the rest of the file is written in the generator-based coroutine style.
@asyncio.coroutine
def connect_to_server():
    """Connect to the ws:// server, retrying until a connection succeeds.

    Host and port come from GRAB_SITE_WS_HOST / GRAB_SITE_WS_PORT (defaults
    127.0.0.1 and 29001). On OSError the coroutine sleeps for the delay
    returned by a Decayer(initial=0.25, multiplier=1.5, maximum=8) and retries.
    """
    host = os.environ.get('GRAB_SITE_WS_HOST', '127.0.0.1')
    port = int(os.environ.get('GRAB_SITE_WS_PORT', 29001))
    decayer = Decayer(0.25, 1.5, 8)
    while True:
        try:
            # The (transport, protocol) result is not needed; the protocol
            # registers itself on ws_factory when the connection opens.
            yield from loop.create_connection(ws_factory, host, port)
        except OSError:
            delay = decayer.decay()
            print_to_real(
                "Could not connect to ws://{}:{}, retrying in {:.1f} seconds..."
                .format(host, port, delay))
            yield from asyncio.sleep(delay)
        else:
            print_to_real("Connected to ws://{}:{}".format(host, port))
            break
|
||||
|
||||
# Grab the event loop and schedule the first connection attempt on it.
loop = asyncio.get_event_loop()
asyncio.ensure_future(connect_to_server())
|
||||
|
||||
def graceful_stop_callback():
    """Announce the stop request and create an empty "stop" file in working_dir."""
    print_to_real("\n^C detected, creating 'stop' file, please wait for exit...")
    # Opening in "wb" mode creates (or truncates) the file; nothing is written.
    with open(os.path.join(working_dir, "stop"), "wb"):
        pass
|
||||
|
||||
def forceful_stop_callback():
    """Stop the event loop immediately."""
    loop.stop()
|
||||
|
||||
# SIGINT requests a graceful stop (via the "stop" file); SIGTERM stops the loop.
loop.add_signal_handler(signal.SIGINT, graceful_stop_callback)
loop.add_signal_handler(signal.SIGTERM, forceful_stop_callback)
|
||||
|
||||
|
||||
# Maps ignore-set name -> list of patterns already fetched during this run.
igset_cache = {}


def get_patterns_for_ignore_set(name):
    """Return the "patterns" list of the named ArchiveBot ignore set.

    The JSON file is downloaded from the ArchiveBot repository on first use
    and cached in igset_cache, so each set is fetched at most once.
    """
    assert name != "", name
    if name in igset_cache:
        return igset_cache[name]
    print_to_real("Fetching ArchiveBot/master/db/ignore_patterns/%s.json" % name)
    url = (
        "https://raw.githubusercontent.com/ArchiveTeam/ArchiveBot/" +
        "master/db/ignore_patterns/%s.json" % name)
    # Use the response as a context manager so the connection is closed even
    # if reading or decoding fails.
    with urlopen(url) as response:
        body = response.read().decode("utf-8")
    igset_cache[name] = json.loads(body)["patterns"]
    return igset_cache[name]
|
||||
|
||||
# Directory holding this grab's control files; the variable must be set
# (a missing GRAB_SITE_WORKING_DIR raises KeyError at import time).
working_dir = os.environ['GRAB_SITE_WORKING_DIR']
|
||||
|
||||
def mtime(f):
    """Return the modification time (st_mtime) of the file at path *f*."""
    return os.stat(f).st_mtime
|
||||
@ -124,83 +127,83 @@ def mtime(f):
|
||||
class FileChangedWatcher(object):
    """Tracks a file's modification time and reports when it changes."""

    def __init__(self, fname):
        """Remember *fname* and its current modification time."""
        self.fname = fname
        self.last_mtime = mtime(fname)

    def has_changed(self):
        """Return True if the mtime differs from the one seen at the last check.

        The file is stat'ed once and that single reading is both compared and
        stored, so a change can never be reported against one value while a
        different value is remembered.
        """
        now_mtime = mtime(self.fname)
        changed = now_mtime != self.last_mtime
        self.last_mtime = now_mtime
        return changed
|
||||
|
||||
|
||||
# Watch the two control files so ignore patterns can be reloaded when edited.
igsets_watcher = FileChangedWatcher(os.path.join(working_dir, "igsets"))
ignores_watcher = FileChangedWatcher(os.path.join(working_dir, "ignores"))

ignoracle = Ignoracle()
|
||||
|
||||
def update_ignoracle():
    """Rebuild the ignore pattern set from the "igsets" and "ignores" files.

    "igsets" holds a comma-separated list of ignore-set names and "ignores"
    holds one pattern per line. The combined set is printed and handed to
    ignoracle.set_patterns().
    """
    with open(os.path.join(working_dir, "igsets"), "r") as f:
        # An empty file (or stray commas) would otherwise produce "" entries,
        # which get_patterns_for_ignore_set() rejects with an assertion.
        igsets = [
            name
            for name in f.read().strip("\r\n\t ,").split(',')
            if name != ""
        ]

    with open(os.path.join(working_dir, "ignores"), "r") as f:
        ignores = set(ig for ig in f.read().strip("\r\n").split('\n') if ig != "")

    for igset in igsets:
        patterns = get_patterns_for_ignore_set(igset)
        if igset == "global":
            # Drop the patterns in the "global" set that mention archive.org.
            patterns = [p for p in patterns if "archive\\.org" not in p]
        ignores.update(patterns)

    print_to_real("Using these %d ignores:" % len(ignores))
    print_to_real(pprint.pformat(ignores))

    ignoracle.set_patterns(ignores)


# Load the initial pattern set at import time.
update_ignoracle()
|
||||
|
||||
|
||||
def should_ignore_url(url, record_info):
    """
    Returns whether a URL should be ignored.

    The value is whatever ignoracle.ignores() returns for the URL and the
    parameters derived from record_info; accept_url treats a truthy result
    as the matching pattern.
    """
    parameters = parameterize_record_info(record_info)
    return ignoracle.ignores(url, **parameters)
|
||||
|
||||
|
||||
def accept_url(url_info, record_info, verdict, reasons):
    """Decide whether a URL should be fetched.

    Reloads the ignore patterns if either control file changed, rejects
    data: URLs and URLs matching an ignore pattern (returning False), and
    otherwise returns the original *verdict* unchanged.
    """
    # Poll both watchers unconditionally: with a short-circuiting `or`, a
    # change to "igsets" would leave the "ignores" watcher stale and trigger
    # a second, redundant reload on the next call.
    igsets_changed = igsets_watcher.has_changed()
    ignores_changed = ignores_watcher.has_changed()
    if igsets_changed or ignores_changed:
        update_ignoracle()

    url = url_info['url']

    if url.startswith('data:'):
        # data: URLs aren't something you can grab, so drop them to avoid ignore
        # checking and ignore logging.
        return False

    pattern = should_ignore_url(url, record_info)
    if pattern:
        maybe_log_ignore(url, pattern)
        return False

    # If we get here, none of our ignores apply.  Return the original verdict.
    return verdict
|
||||
|
||||
|
||||
def queued_url(url_info):
    """Count one more queued item in job_data."""
    job_data["items_queued"] += 1
|
||||
|
||||
|
||||
def dequeued_url(url_info, record_info):
    """Count one more downloaded item in job_data."""
    job_data["items_downloaded"] += 1
|
||||
|
||||
|
||||
jobData = {
|
||||
"ident": open(os.path.join(workingDir, "id")).read().strip(),
|
||||
"url": open(os.path.join(workingDir, "start_url")).read().strip(),
|
||||
"started_at": os.stat(os.path.join(workingDir, "start_url")).st_mtime,
|
||||
job_data = {
|
||||
"ident": open(os.path.join(working_dir, "id")).read().strip(),
|
||||
"url": open(os.path.join(working_dir, "start_url")).read().strip(),
|
||||
"started_at": os.stat(os.path.join(working_dir, "start_url")).st_mtime,
|
||||
"suppress_ignore_reports": True,
|
||||
"concurrency": int(open(os.path.join(workingDir, "concurrency")).read().strip()),
|
||||
"concurrency": int(open(os.path.join(working_dir, "concurrency")).read().strip()),
|
||||
"bytes_downloaded": 0,
|
||||
"items_queued": 0,
|
||||
"items_downloaded": 0,
|
||||
@ -214,38 +217,38 @@ jobData = {
|
||||
"runk": 0,
|
||||
}
|
||||
|
||||
def handleResult(urlInfo, recordInfo, errorInfo={}, httpInfo={}):
|
||||
#print("urlInfo", urlInfo)
|
||||
#print("recordInfo", recordInfo)
|
||||
#print("errorInfo", errorInfo)
|
||||
#print("httpInfo", httpInfo)
|
||||
def handle_result(url_info, record_info, error_info={}, http_info={}):
|
||||
#print("url_info", url_info)
|
||||
#print("record_info", record_info)
|
||||
#print("error_info", error_info)
|
||||
#print("http_info", http_info)
|
||||
|
||||
updateIgoffInJobData()
|
||||
update_igoff_in_job_data()
|
||||
|
||||
response_code = 0
|
||||
if httpInfo.get("response_code"):
|
||||
response_code = httpInfo.get("response_code")
|
||||
response_code_str = str(httpInfo["response_code"])
|
||||
if http_info.get("response_code"):
|
||||
response_code = http_info.get("response_code")
|
||||
response_code_str = str(http_info["response_code"])
|
||||
if len(response_code_str) == 3 and response_code_str[0] in "12345":
|
||||
jobData["r%sxx" % response_code_str[0]] += 1
|
||||
job_data["r%sxx" % response_code_str[0]] += 1
|
||||
else:
|
||||
jobData["runk"] += 1
|
||||
job_data["runk"] += 1
|
||||
|
||||
if httpInfo.get("body"):
|
||||
jobData["bytes_downloaded"] += httpInfo["body"]["content_size"]
|
||||
if http_info.get("body"):
|
||||
job_data["bytes_downloaded"] += http_info["body"]["content_size"]
|
||||
|
||||
stop = shouldStop()
|
||||
stop = should_stop()
|
||||
|
||||
response_message = httpInfo.get("response_message")
|
||||
if errorInfo:
|
||||
response_message = http_info.get("response_message")
|
||||
if error_info:
|
||||
response_code = 0
|
||||
response_message = errorInfo["error"]
|
||||
response_message = error_info["error"]
|
||||
|
||||
if wsFactory.client:
|
||||
wsFactory.client.sendObject({
|
||||
if ws_factory.client:
|
||||
ws_factory.client.send_object({
|
||||
"type": "download",
|
||||
"job_data": jobData,
|
||||
"url": urlInfo["url"],
|
||||
"job_data": job_data,
|
||||
"url": url_info["url"],
|
||||
"response_code": response_code,
|
||||
"response_message": response_message,
|
||||
})
|
||||
@ -256,33 +259,33 @@ def handleResult(urlInfo, recordInfo, errorInfo={}, httpInfo={}):
|
||||
return wpull_hook.actions.NORMAL
|
||||
|
||||
|
||||
def handle_response(url_info, record_info, http_info):
    """Forward a response to handle_result() as its http_info argument."""
    return handle_result(url_info, record_info, http_info=http_info)
|
||||
|
||||
|
||||
def handle_error(url_info, record_info, error_info):
    """Forward an error to handle_result() as its error_info argument."""
    return handle_result(url_info, record_info, error_info=error_info)
|
||||
|
||||
|
||||
# TODO: check only every 5 seconds max
def should_stop():
    """Return True if a "stop" file exists in working_dir."""
    return os.path.exists(os.path.join(working_dir, "stop"))
|
||||
|
||||
|
||||
# TODO: check only every 5 seconds max
def update_igoff_in_job_data():
    """Store whether an "igoff" file exists in job_data and return that flag.

    The result is written to job_data["suppress_ignore_reports"].
    """
    igoff = os.path.exists(os.path.join(working_dir, "igoff"))
    job_data["suppress_ignore_reports"] = igoff
    return igoff
|
||||
|
||||
|
||||
def maybe_log_ignore(url, pattern):
    """Report an ignored URL unless ignore reports are suppressed.

    When no "igoff" file is present, the ignore is printed and, if a
    websocket client is connected, sent to it as an "ignore" message.
    """
    # NOTE(review): nesting reconstructed from a de-indented diff; the
    # websocket report is assumed to be suppressed together with the print.
    if not update_igoff_in_job_data():
        print_to_real("IGNOR %s by %s" % (url, pattern))
        if ws_factory.client:
            ws_factory.client.send_object({
                "type": "ignore",
                "job_data": job_data,
                "url": url,
                "pattern": pattern
            })
|
||||
@ -292,74 +295,75 @@ def maybeLogIgnore(url, pattern):
|
||||
# Header-name prefixes and Server value used to detect ICY/Icecast streams.
# Both are applied with .match(), i.e. anchored at the start of the string.
ICY_FIELD_PATTERN = re.compile(r'icy-|ice-|x-audiocast-', re.IGNORECASE)
ICY_VALUE_PATTERN = re.compile(r'icecast', re.IGNORECASE)
|
||||
|
||||
def handle_pre_response(url_info, url_record, response_info):
    """Finish early on ICY/Icecast responses; otherwise allow the download.

    Returns wpull_hook.actions.FINISH (after logging the ignore) when the
    response version is "ICY", a header name matches ICY_FIELD_PATTERN, or
    the Server header matches ICY_VALUE_PATTERN. Otherwise prints the URL
    and returns wpull_hook.actions.NORMAL.
    """
    url = url_info['url']

    # Check if server version starts with ICY
    if response_info.get('version', '') == 'ICY':
        maybe_log_ignore(url, '[icy version]')
        return wpull_hook.actions.FINISH

    # Loop through all the server headers for matches
    for field, value in response_info.get('fields', []):
        if ICY_FIELD_PATTERN.match(field):
            maybe_log_ignore(url, '[icy field]')
            return wpull_hook.actions.FINISH

        # HTTP header names are case-insensitive, so accept "server",
        # "SERVER", etc. as well as "Server".
        if field.lower() == 'server' and ICY_VALUE_PATTERN.match(value):
            maybe_log_ignore(url, '[icy server]')
            return wpull_hook.actions.FINISH

    # Nothing matched, allow download
    print_to_real(url + " ...")
    return wpull_hook.actions.NORMAL
|
||||
|
||||
|
||||
def stdout_write_both(message):
    """Write *message* (bytes) to the real stdout and mirror it to the websocket.

    Any exception raised while writing or sending is reported on the real
    stderr instead of propagating.
    """
    assert isinstance(message, bytes), message
    try:
        real_stdout_write(message)
        if ws_factory.client:
            ws_factory.client.send_object({
                "type": "stdout",
                "job_data": job_data,
                "message": message.decode("utf-8")
            })
    except Exception as e:
        real_stderr_write((str(e) + "\n").encode("utf-8"))
|
||||
|
||||
|
||||
def stderr_write_both(message):
    """Write *message* (bytes) to the real stderr and mirror it to the websocket.

    Any exception raised while writing or sending is reported on the real
    stderr instead of propagating.
    """
    assert isinstance(message, bytes), message
    try:
        real_stderr_write(message)
        if ws_factory.client:
            ws_factory.client.send_object({
                "type": "stderr",
                "job_data": job_data,
                "message": message.decode("utf-8")
            })
    except Exception as e:
        real_stderr_write((str(e) + "\n").encode("utf-8"))
|
||||
|
||||
# Route all binary writes to stdout/stderr through the mirroring wrappers.
sys.stdout.buffer.write = stdout_write_both
sys.stderr.buffer.write = stderr_write_both
|
||||
|
||||
|
||||
def exit_status(code):
    """Print a summary of the finished grab and return *code* unchanged."""
    print()
    print("Finished grab {} {} with exit code {}".format(
        job_data["ident"], job_data["url"], code))
    print("Output is in directory:\n{}".format(working_dir))
    return code
|
||||
|
||||
|
||||
# Require version 2 of the hook API, then register the callbacks defined above.
assert 2 in wpull_hook.callbacks.AVAILABLE_VERSIONS

wpull_hook.callbacks.version = 2
wpull_hook.callbacks.accept_url = accept_url
wpull_hook.callbacks.queued_url = queued_url
wpull_hook.callbacks.dequeued_url = dequeued_url
wpull_hook.callbacks.handle_response = handle_response
wpull_hook.callbacks.handle_error = handle_error
wpull_hook.callbacks.handle_pre_response = handle_pre_response
wpull_hook.callbacks.exit_status = exit_status
|
||||
|
Loading…
x
Reference in New Issue
Block a user