grab-site/libgrabsite/wpull_hooks.py
Ivan Kozik cdd7928750 Use DIR/scrape file to control whether to scrape for new URLs in responses
present = do scrape
missing = don't scrape
2018-08-28 01:22:19 +00:00

600 lines
16 KiB
Python

import re
import os
import sys
import json
import time
import pprint
import signal
import random
import functools
import traceback
import trollius as asyncio
from autobahn.asyncio.websocket import WebSocketClientFactory, WebSocketClientProtocol
from libgrabsite.ignoracle import Ignoracle, parameterize_record_info
import libgrabsite
# Keep references to the original byte-level writers; the `write` attributes
# of the stdout/stderr buffers are replaced further down in this module.
real_stdout_write = sys.stdout.buffer.write
real_stderr_write = sys.stderr.buffer.write


def print_to_terminal(s):
    """
    Write the string `s` followed by a newline to the saved stdout writer,
    encoded as UTF-8, then flush the stdout buffer.
    """
    encoded_line = (s + "\n").encode("utf-8")
    real_stdout_write(encoded_line)
    sys.stdout.buffer.flush()
class GrabberClientProtocol(WebSocketClientProtocol):
    """
    WebSocket client protocol used to report to the ws:// server.

    On open it registers itself as `factory.client` and sends a "hello"
    message; on close it clears `factory.client` and schedules a reconnect.
    """

    def send_object(self, obj):
        """Serialize `obj` to JSON and send it as one UTF-8 encoded message."""
        payload = json.dumps(obj).encode("utf-8")
        self.sendMessage(payload)

    def on_open(self):
        """Register this connection on the factory and announce the job URL."""
        self.factory.client = self
        hello = {
            "type": "hello",
            "mode": "grabber",
            "url": job_data["url"]
        }
        self.send_object(hello)

    def on_close(self, was_clean, code, reason):
        """Unregister this connection, report the close, and start reconnecting."""
        self.factory.client = None
        close_info = (was_clean, code, reason)
        print_to_terminal(
            "Disconnected from ws:// server with (was_clean, code, reason): {!r}"
            .format(close_info))
        asyncio.ensure_future(connect_to_server())

    # camelCase aliases for the two handlers above
    onOpen = on_open
    onClose = on_close
class GrabberClientFactory(WebSocketClientFactory):
    """
    Factory for GrabberClientProtocol connections.

    `client` is the currently open protocol instance, or None while
    disconnected; the protocol's open/close handlers maintain it.
    """
    protocol = GrabberClientProtocol

    def __init__(self):
        super(GrabberClientFactory, self).__init__()
        self.client = None


# Single shared factory; other code checks `ws_factory.client` before sending.
ws_factory = GrabberClientFactory()
class Decayer(object):
    """
    Yields a value that grows by a fixed factor on every call to decay(),
    never exceeding a cap.
    """

    def __init__(self, initial, multiplier, maximum):
        """
        initial - initial number to return
        multiplier - multiply number by this value after each call to decay()
        maximum - cap number at this value
        """
        self.initial = initial
        self.multiplier = multiplier
        self.maximum = maximum
        self.reset()

    def reset(self):
        """Rewind so that the next decay() yields `initial`; returns the rewound value."""
        # decay() always multiplies first, so start one step *below* `initial`
        # in order for the first decay() call to land on the `initial` value.
        rewound = self.initial / self.multiplier
        self.current = rewound
        return rewound

    def decay(self):
        """Advance one step and return the new value, capped at `maximum`."""
        grown = self.current * self.multiplier
        self.current = self.maximum if self.maximum < grown else grown
        return self.current
@asyncio.coroutine
def connect_to_server():
    """
    Coroutine that keeps trying to connect `ws_factory` to the ws:// server
    until a connection is established.

    Host and port come from the GRAB_SITE_HOST / GRAB_SITE_PORT environment
    variables (defaults 127.0.0.1 and 29000).  After each OSError it sleeps
    for a delay that starts at 0.25 and is multiplied by 1.5 per failure, up
    to a cap of 8.
    """
    host = os.environ.get('GRAB_SITE_HOST', '127.0.0.1')
    port = int(os.environ.get('GRAB_SITE_PORT', 29000))
    backoff = Decayer(0.25, 1.5, 8)
    connected = False
    while not connected:
        try:
            yield from loop.create_connection(ws_factory, host, port)
            connected = True
        except OSError:
            delay = backoff.decay()
            print_to_terminal(
                "Could not connect to ws://{}:{}, retrying in {:.1f} seconds..."
                .format(host, port, delay))
            yield from asyncio.sleep(delay)
    print_to_terminal("Connected to ws://{}:{}".format(host, port))
# Grab the event loop and schedule the first connection attempt to the
# ws:// server.  The coroutine is only scheduled here, not waited for.
loop = asyncio.get_event_loop()
asyncio.ensure_future(connect_to_server())
def graceful_stop_callback():
    """
    Announce the interrupt on the terminal and create (or truncate) an empty
    "stop" file in the working directory.
    """
    print_to_terminal("\n^C detected, creating 'stop' file, please wait for exit...")
    stop_file = os.path.join(working_dir, "stop")
    open(stop_file, "wb").close()
def forceful_stop_callback():
    """Stop the event loop right away."""
    loop.stop()


# SIGINT requests a graceful stop, SIGTERM a forceful one.
try:
    for _signum, _handler in (
        (signal.SIGINT, graceful_stop_callback),
        (signal.SIGTERM, forceful_stop_callback),
    ):
        loop.add_signal_handler(_signum, _handler)
except NotImplementedError:
    # Not supported on Windows
    pass
# Directory of ignore-set files, located inside the libgrabsite package directory.
ignore_sets_path = os.path.join(os.path.dirname(libgrabsite.__file__), "ignore_sets")
def get_patterns_for_ignore_set(name):
    """
    Return the ignore patterns stored in the ignore set called `name`.

    The set is read (as UTF-8) from the file `name` inside `ignore_sets_path`.
    Empty lines and lines starting with "# " are dropped.

    Returns a real list rather than a one-shot lazy iterator, so callers may
    iterate the result repeatedly or take its len().

    Raises AssertionError when `name` is empty, and OSError when the file
    cannot be opened.
    """
    assert name != "", name
    with open(os.path.join(ignore_sets_path, name), "r", encoding="utf-8") as f:
        lines = f.read().strip("\n").split("\n")
    return [line for line in lines if line and not line.startswith("# ")]
# Working directory of this job, taken from the GRAB_SITE_WORKING_DIR
# environment variable (KeyError if it is not set).
working_dir = os.environ['GRAB_SITE_WORKING_DIR']
# Don't swallow during startup: while this is False, functions wrapped with
# swallow_exception re-raise.  It is set to True at the end of this module.
really_swallow_exceptions = False
def swallow_exception(f):
    """
    Decorator: once the module-level `really_swallow_exceptions` flag is
    true, any Exception raised by `f` is printed as a traceback and the
    wrapper returns None.  While the flag is false the exception propagates.
    """
    @functools.wraps(f)
    def wrapper(*args, **kwargs):
        try:
            result = f(*args, **kwargs)
        except Exception:
            if really_swallow_exceptions:
                traceback.print_exc()
                return None
            raise
        return result
    return wrapper
# How long (in time.monotonic() units, i.e. seconds) a cached answer is reused.
CONTROL_FILE_CACHE_SEC = 1.5


def caching_decorator(f):
    """
    Wrap the one-argument function `f(path)` so that its result is remembered
    per `path` and reused for up to CONTROL_FILE_CACHE_SEC.

    Only successful results are stored; if `f` raises, the exception
    propagates and nothing is cached for that call.
    """
    cache = {}
    never_cached = (-CONTROL_FILE_CACHE_SEC, None)

    @functools.wraps(f)
    def wrapper(path):
        cached_at, cached_val = cache.get(path, never_cached)
        oldest_still_fresh = time.monotonic() - CONTROL_FILE_CACHE_SEC
        if cached_at > oldest_still_fresh:
            return cached_val
        fresh_val = f(path)
        cache[path] = (time.monotonic(), fresh_val)
        return fresh_val
    return wrapper
@caching_decorator
def path_exists_with_cache(path):
    """Return whether `path` exists; the answer is cached briefly per path."""
    exists = os.path.exists(path)
    return exists


@caching_decorator
def mtime_with_cache(path):
    """
    Return the modification time (st_mtime) of `path`; cached briefly per path.

    Raises whatever os.stat raises, e.g. when the file does not exist.
    """
    stat_result = os.stat(path)
    return stat_result.st_mtime
class FileChangedWatcher(object):
    """
    Remembers the last seen modification time of one file and reports
    whether it differs on the next check.
    """

    def __init__(self, fname):
        self.fname = fname
        # Start from a bogus mtime so that the first has_changed() call
        # returns True
        self.last_mtime = -1

    def has_changed(self):
        """
        Return True if the (cached) mtime of the file differs from the one
        seen by the previous call, and remember the new mtime.
        """
        previous_mtime, self.last_mtime = self.last_mtime, mtime_with_cache(self.fname)
        if previous_mtime == self.last_mtime:
            return False
        print("Picked up the changes to %s" % self.fname)
        return True
# One watcher per control file in the working directory.  The update_*()
# functions in this module poll them to decide whether to re-read a file.
igsets_watcher = FileChangedWatcher(os.path.join(working_dir, "igsets"))
ignores_watcher = FileChangedWatcher(os.path.join(working_dir, "ignores"))
delay_watcher = FileChangedWatcher(os.path.join(working_dir, "delay"))
concurrency_watcher = FileChangedWatcher(os.path.join(working_dir, "concurrency"))
max_content_length_watcher = FileChangedWatcher(os.path.join(working_dir, "max_content_length"))
custom_hooks_watcher = FileChangedWatcher(os.path.join(working_dir, "custom_hooks.py"))
# Holds the active ignore patterns; update_ignoracle() fills it in.
ignoracle = Ignoracle()
@swallow_exception
def update_ignoracle():
    """
    Reload the ignore patterns if the "igsets" or "ignores" control file in
    the working directory has changed, print them, and install them on
    `ignoracle`.

    "igsets" holds a comma-separated list of ignore-set names (it may be
    empty); "ignores" holds one extra pattern per line.
    """
    # Poll both watchers unconditionally.  With a short-circuiting `or`, a
    # change to "igsets" would leave the "ignores" watcher un-polled, so the
    # following call would still see its old mtime and reload (and re-print)
    # everything a second time.
    igsets_changed = igsets_watcher.has_changed()
    ignores_changed = ignores_watcher.has_changed()
    if not (igsets_changed or ignores_changed):
        return

    with open(os.path.join(working_dir, "igsets"), "r") as f:
        # Drop empty names: an empty file would otherwise yield [""], which
        # get_patterns_for_ignore_set() rejects.
        igsets = [name for name in f.read().strip("\r\n\t ,").split(',') if name]

    with open(os.path.join(working_dir, "ignores"), "r") as f:
        ignores = set(ig for ig in f.read().strip("\r\n").split('\n') if ig != "")

    for igset in igsets:
        ignores.update(get_patterns_for_ignore_set(igset))

    print_to_terminal("Using these %d ignores:" % len(ignores))
    for ig in sorted(ignores):
        print_to_terminal("\t" + ig)

    ignoracle.set_patterns(ignores)


update_ignoracle()
def should_ignore_url(url, record_info):
    """
    Ask `ignoracle` whether `url` should be ignored, passing along the
    parameters derived from `record_info`.

    Returns the result of `ignoracle.ignores()`; accept_url() treats a
    truthy result as the pattern that matched.
    """
    return ignoracle.ignores(url, **parameterize_record_info(record_info))
# Start URLs, one per line in the "all_start_urls" file.  Stored as a
# frozenset because accept_url() tests membership for every candidate URL;
# the file is closed as soon as it has been read.
with open(os.path.join(working_dir, "all_start_urls")) as _start_urls_file:
    all_start_urls = frozenset(_start_urls_file.read().rstrip("\n").split("\n"))


def accept_url(url_info, record_info, verdict, reasons):
    """
    Decide whether a URL may be queued.

    Returns False for data: URLs and for URLs matched by an ignore pattern,
    True for any start URL, and otherwise the incoming `verdict` unchanged.
    `reasons` is not used.
    """
    update_ignoracle()

    url = url_info['url']

    if url.startswith('data:'):
        # data: URLs aren't something you can grab, so drop them to avoid ignore
        # checking and ignore logging.
        return False

    # Don't apply ignores to any of the start URLs
    if url in all_start_urls:
        return True

    pattern = should_ignore_url(url, record_info)
    if pattern:
        maybe_log_ignore(url, pattern)
        return False

    # If we get here, none of our ignores apply.  Return the original verdict.
    return verdict
def queued_url(url_info):
    """Count one more queued item in job_data; `url_info` is not used."""
    job_data["items_queued"] = job_data["items_queued"] + 1


def dequeued_url(url_info, record_info):
    """Count one more downloaded item in job_data; the arguments are not used."""
    job_data["items_downloaded"] = job_data["items_downloaded"] + 1
def _read_stripped(path):
    """Return the contents of the text file at `path`, stripped of surrounding whitespace."""
    # `with` closes the handle right away instead of leaving it to the
    # garbage collector.
    with open(path) as f:
        return f.read().strip()


_start_url_path = os.path.join(working_dir, "start_url")

# Mutable state of this job.  The whole dict is included in the messages
# sent to the ws:// server.
job_data = {
    "ident": _read_stripped(os.path.join(working_dir, "id")),
    "url": _read_stripped(_start_url_path),
    # mtime of the "start_url" file
    "started_at": os.stat(_start_url_path).st_mtime,
    # -1 disables the content-length check in handle_pre_response()
    "max_content_length": -1,
    # The next three are refreshed from the "igoff", "video" and "scrape"
    # control files by update_igoff(), update_video() and update_scrape()
    "suppress_ignore_reports": True,
    "video": True,
    "scrape": True,
    "concurrency": 2,
    "bytes_downloaded": 0,
    "items_queued": 0,
    "items_downloaded": 0,
    # Bounds for the random delay; wait_time() divides the drawn value by 1000
    "delay_min": 0,
    "delay_max": 0,
    # Response counters by status class; "runk" counts response codes that
    # are not three characters starting with 1-5
    "r1xx": 0,
    "r2xx": 0,
    "r3xx": 0,
    "r4xx": 0,
    "r5xx": 0,
    "runk": 0,
}
def handle_result(url_info, record_info, error_info=None, http_info=None):
    """
    Shared handler for finished requests, successful or failed.

    Updates the response and byte counters in `job_data`, reports the
    download to the ws:// server when connected, and returns
    `wpull_hook.actions.STOP` if the "stop" file exists, otherwise
    `wpull_hook.actions.NORMAL`.

    error_info - mapping with an "error" entry when the request failed
    http_info - mapping that may carry "response_code", "response_message"
                and "body" (with "content_size")
    Both default to an empty mapping.
    """
    # None instead of `{}` defaults: a dict literal in the signature is one
    # shared object for every call.
    if error_info is None:
        error_info = {}
    if http_info is None:
        http_info = {}

    update_igoff()

    response_code = 0
    if http_info.get("response_code"):
        response_code = http_info["response_code"]
        response_code_str = str(response_code)
        if len(response_code_str) == 3 and response_code_str[0] in "12345":
            job_data["r%sxx" % response_code_str[0]] += 1
        else:
            job_data["runk"] += 1

    if http_info.get("body"):
        job_data["bytes_downloaded"] += http_info["body"]["content_size"]

    stop = should_stop()

    response_message = http_info.get("response_message")
    if error_info:
        # Failed requests are reported with code 0 and the error text
        response_code = 0
        response_message = error_info["error"]

    if ws_factory.client:
        ws_factory.client.send_object({
            "type": "download",
            "job_data": job_data,
            "url": url_info["url"],
            "response_code": response_code,
            "response_message": response_message,
        })

    if stop:
        return wpull_hook.actions.STOP

    return wpull_hook.actions.NORMAL
def handle_response(url_info, record_info, http_info):
    """Forward a received response to handle_result() and return its action."""
    action = handle_result(url_info, record_info, http_info=http_info)
    return action


def handle_error(url_info, record_info, error_info):
    """Forward a request error to handle_result() and return its action."""
    action = handle_result(url_info, record_info, error_info=error_info)
    return action
stop_path = os.path.join(working_dir, "stop")


def should_stop():
    """Return whether the "stop" control file exists (cached lookup)."""
    stop_requested = path_exists_with_cache(stop_path)
    return stop_requested


igoff_path = os.path.join(working_dir, "igoff")


def update_igoff():
    """Set job_data["suppress_ignore_reports"] to whether the "igoff" file exists."""
    igoff_present = path_exists_with_cache(igoff_path)
    job_data["suppress_ignore_reports"] = igoff_present


update_igoff()

video_path = os.path.join(working_dir, "video")


def update_video():
    """Set job_data["video"] to whether the "video" file exists."""
    video_present = path_exists_with_cache(video_path)
    job_data["video"] = video_present


update_video()
scrape_path = os.path.join(working_dir, "scrape")


def update_scrape():
    """
    Set job_data["scrape"] to whether the "scrape" control file exists, and
    switch scraping off when it does not.

    Nothing in this function puts the scrapers back once they have been
    removed.
    """
    job_data["scrape"] = scrape_enabled = path_exists_with_cache(scrape_path)
    if scrape_enabled:
        return
    # Empty the list of scrapers, which will stop scraping for new URLs
    # but still keep going through what is already in the queue.
    demux_scraper = wpull_hook.factory.get('DemuxDocumentScraper')
    demux_scraper._document_scrapers = []


update_scrape()
def maybe_log_ignore(url, pattern):
    """
    Report that `url` was ignored because of `pattern`, unless ignore
    reports are suppressed.

    The report goes to the terminal and, when connected, to the ws:// server.
    """
    update_igoff()
    if job_data["suppress_ignore_reports"]:
        return
    print_to_terminal("IGNOR %s\n by %s" % (url, pattern))
    client = ws_factory.client
    if client:
        report = {
            "type": "ignore",
            "job_data": job_data,
            "url": url,
            "pattern": pattern
        }
        client.send_object(report)
# Header-name prefixes and Server-header value prefix checked by
# handle_pre_response(); both are applied with .match(), i.e. anchored at the
# start of the string, and ignore case.
ICY_FIELD_PATTERN = re.compile(r'icy-|ice-|x-audiocast-', flags=re.I)
ICY_VALUE_PATTERN = re.compile(r'icecast', flags=re.I)
def get_content_length(response_info):
    """
    Return the value of the first Content-Length header as an int.

    `response_info["fields"]` is read as a sequence of (name, value) pairs.
    The header name is compared case-insensitively, since HTTP field names
    are not case-sensitive.

    Returns -1 when there is no "fields" entry, no Content-Length header, or
    its value is not an integer.
    """
    for name, value in response_info.get("fields", []):
        if name.lower() == "content-length":
            try:
                return int(value)
            except (TypeError, ValueError):
                return -1
    return -1
def has_content_type_video(response_info):
    """
    Return True if the first Content-Type header starts with "video/".

    `response_info["fields"]` is read as a sequence of (name, value) pairs.
    Header name and value are both compared case-insensitively, and
    whitespace around the value is ignored.

    Returns False when there is no "fields" entry or no Content-Type header.
    """
    for name, value in response_info.get("fields", []):
        if name.lower() == "content-type":
            return value.strip().lower().startswith("video/")
    return False
# Excluded vob, mpeg, mpg, avi because they are not found on the general web
video_exts = set("webm mp4 m4v mkv ts 3gp 3g2 flv mov wmv ogv ogm".split(" "))


def has_video_ext(url):
    """
    Return True if the last path segment of `url` ends in one of the
    extensions in `video_exts` (compared case-insensitively).

    Only the path component is examined, so a query string or fragment after
    the file name does not hide the extension, and a bare host name such as
    "example.mov" is not mistaken for a video file.  URLs that cannot be
    parsed yield False.
    """
    from urllib.parse import urlsplit

    try:
        path = urlsplit(url).path
    except ValueError:
        return False
    filename = path.rsplit("/", 1)[-1]
    _, dot, ext = filename.rpartition(".")
    # `dot` is empty when the file name contains no "." at all
    return bool(dot) and ext.lower() in video_exts
# Logs of URLs that handle_pre_response() skipped, one URL per line.
# Mode "w" truncates an existing file each time this module is loaded; the
# handles stay open and are flushed after every write.
skipped_videos_path = os.path.join(working_dir, "skipped_videos")
skipped_videos = open(skipped_videos_path, "w", encoding="utf-8")
skipped_max_content_length_path = os.path.join(working_dir, "skipped_max_content_length")
skipped_max_content_length = open(skipped_max_content_length_path, "w", encoding="utf-8")
def handle_pre_response(url_info, url_record, response_info):
    """
    Decide, from the response headers, whether the body should be downloaded.

    Returns `wpull_hook.actions.FINISH` (after logging the reason through
    maybe_log_ignore) when:
      * a max content length is set and the Content-Length exceeds it,
      * videos are disabled and the response looks like a video,
      * the response looks like an ICY/Icecast stream.
    Otherwise prints the URL and returns `wpull_hook.actions.NORMAL`.
    `url_record` is not used.
    """
    url = url_info['url']

    update_scrape()
    update_max_content_length()

    limit = job_data["max_content_length"]
    if limit != -1:
        length = get_content_length(response_info)
        if length > limit:
            skipped_max_content_length.write(url + "\n")
            skipped_max_content_length.flush()
            maybe_log_ignore(url, '[content-length %d over limit %d]' % (length, limit))
            return wpull_hook.actions.FINISH

    update_video()
    videos_disabled = not job_data["video"]
    if videos_disabled and (has_content_type_video(response_info) or has_video_ext(url)):
        skipped_videos.write(url + "\n")
        skipped_videos.flush()
        maybe_log_ignore(url, '[video]')
        return wpull_hook.actions.FINISH

    # Look for signs of an ICY stream: first the server version, then every
    # header until one matches.
    icy_reason = None
    if response_info.get('version', '') == 'ICY':
        icy_reason = '[icy version]'
    else:
        for field, value in response_info.get('fields', []):
            if ICY_FIELD_PATTERN.match(field):
                icy_reason = '[icy field]'
                break
            if field == 'Server' and ICY_VALUE_PATTERN.match(value):
                icy_reason = '[icy server]'
                break

    if icy_reason is not None:
        maybe_log_ignore(url, icy_reason)
        return wpull_hook.actions.FINISH

    # Nothing matched, allow download
    print_to_terminal(url + " ...")
    return wpull_hook.actions.NORMAL
def stdout_write_both(message):
    """
    Write the bytes `message` to the real stdout and, when connected, also
    send it (decoded as UTF-8) to the ws:// server as a "stdout" message.

    Any exception raised along the way is written to the real stderr instead
    of propagating.
    """
    assert isinstance(message, bytes), message
    try:
        real_stdout_write(message)
        client = ws_factory.client
        if client:
            payload = {
                "type": "stdout",
                "job_data": job_data,
                "message": message.decode("utf-8")
            }
            client.send_object(payload)
    except Exception as e:
        error_line = str(e) + "\n"
        real_stderr_write(error_line.encode("utf-8"))
def stderr_write_both(message):
    """
    Write the bytes `message` to the real stderr and, when connected, also
    send it (decoded as UTF-8) to the ws:// server as a "stderr" message.

    Any exception raised along the way is written to the real stderr instead
    of propagating.
    """
    assert isinstance(message, bytes), message
    try:
        real_stderr_write(message)
        client = ws_factory.client
        if client:
            payload = {
                "type": "stderr",
                "job_data": job_data,
                "message": message.decode("utf-8")
            }
            client.send_object(payload)
    except Exception as e:
        error_line = str(e) + "\n"
        real_stderr_write(error_line.encode("utf-8"))
# Replace the byte-level writers of stdout and stderr with the tee functions
# defined above, so writes go to the terminal and to the ws:// server.
sys.stdout.buffer.write = stdout_write_both
sys.stderr.buffer.write = stderr_write_both
def exit_status(code):
    """
    Print a closing summary (job ident, start URL, exit code and output
    directory) and return `code` unchanged.
    """
    print()
    summary = "Finished grab {} {} with exit code {}".format(
        job_data["ident"], job_data["url"], code)
    print(summary)
    print("Output is in directory:\n{}".format(working_dir))
    return code
@swallow_exception
def update_max_content_length():
    """
    Re-read the "max_content_length" control file when it has changed and
    store its integer value in job_data["max_content_length"].
    """
    if max_content_length_watcher.has_changed():
        with open(max_content_length_watcher.fname, "r") as f:
            raw_value = f.read()
        job_data["max_content_length"] = int(raw_value.strip())


update_max_content_length()
@swallow_exception
def update_delay():
    """
    Re-read the "delay" control file when it has changed.

    The file holds either a single integer, used as both minimum and maximum
    delay, or "MIN-MAX".  The values go to job_data["delay_min"] and
    job_data["delay_max"].
    """
    if not delay_watcher.has_changed():
        return
    with open(delay_watcher.fname, "r") as f:
        content = f.read().strip()
    if "-" in content:
        low, high = (int(part) for part in content.split("-", 1))
    else:
        low = high = int(content)
    job_data["delay_min"], job_data["delay_max"] = low, high


update_delay()
@swallow_exception
def update_concurrency():
    """
    Re-read the "concurrency" control file when it has changed, store the
    value (at least 1) in job_data["concurrency"] and apply it to the Engine.
    """
    if not concurrency_watcher.has_changed():
        return
    with open(concurrency_watcher.fname, "r") as f:
        requested = int(f.read().strip())
    concurrency = requested
    if requested < 1:
        print("Warning: using 1 for concurrency instead of %r because it cannot be < 1" % (requested,))
        concurrency = 1
    job_data["concurrency"] = concurrency
    # If we call this with 0, trollius blows up with
    # ValueError('Set of coroutines/Futures is empty.')
    engine = wpull_hook.factory.get('Engine')
    engine.set_concurrent(concurrency)


update_concurrency()
def wait_time(seconds, url_info, record_info, response_info, error_info):
    """
    Refresh the delay, concurrency and custom-hook settings, then return a
    random wait drawn uniformly between job_data["delay_min"] and
    job_data["delay_max"], divided by 1000 (presumably milliseconds to
    seconds - confirm against the "delay" file format).  The arguments are
    not used.
    """
    update_delay()
    update_concurrency()
    update_custom_hooks()
    drawn_delay = random.uniform(job_data["delay_min"], job_data["delay_max"])
    return drawn_delay / 1000
def get_urls(filename, url_info, document_info):
    """
    Return extra URLs to queue for the URL in `url_info["url"]`, or None.

    The result is a list of dicts with "url", "link_type" and, for media,
    "inline".  `filename` and `document_info` are not used.
    """
    url = url_info["url"]

    # If we see this URL, also queue the URL for the :orig quality image
    if url.startswith("https://pbs.twimg.com/media/"):
        new_url = re.sub(r":[a-z]{1,10}$", "", url) + ":orig"
        if new_url == url:
            # Already the :orig variant; don't queue the URL we are on.
            return None
        # see wpull/item.py:LinkType
        return [dict(url=new_url, link_type="media", inline=True)]

    # Quora shows login-required screen unless you add ?share=1
    if url.startswith("https://www.quora.com/") and "?" not in url:
        return [dict(url=url + "?share=1", link_type="html")]

    return None
@swallow_exception
def update_custom_hooks():
    """
    (Re)install the wpull callbacks and run custom_hooks.py when that file
    has changed.

    The base callbacks from this module are installed first, then
    custom_hooks.py is compiled and executed with `wpull_hook`,
    `maybe_log_ignore` and `print_to_terminal` in its namespace.
    """
    if not custom_hooks_watcher.has_changed():
        return

    assert 3 in wpull_hook.callbacks.AVAILABLE_VERSIONS

    # Set these every time, because custom_hooks.py may be wrapping them,
    # and when custom_hooks.py is reloaded, we want it to re-wrap the base
    # functions instead of its already-wrapped functions.
    wpull_hook.callbacks.version = 3
    base_callbacks = (
        ("accept_url", accept_url),
        ("queued_url", queued_url),
        ("dequeued_url", dequeued_url),
        ("handle_response", handle_response),
        ("handle_error", handle_error),
        ("handle_pre_response", handle_pre_response),
        ("exit_status", exit_status),
        ("wait_time", wait_time),
        ("get_urls", get_urls),
    )
    for callback_name, callback in base_callbacks:
        setattr(wpull_hook.callbacks, callback_name, callback)

    custom_hooks_filename = os.path.join(working_dir, "custom_hooks.py")
    with open(custom_hooks_filename, 'rb') as in_file:
        source = in_file.read()
        code = compile(source, custom_hooks_filename, 'exec')
        context = dict(
            wpull_hook=wpull_hook,
            maybe_log_ignore=maybe_log_ignore,
            print_to_terminal=print_to_terminal,
        )
        exec(code, context, context)


update_custom_hooks()

# Startup is complete: from here on, functions wrapped with swallow_exception
# print a traceback instead of raising.
really_swallow_exceptions = True