2015-04-05 17:15:27 +02:00
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
2022-02-13 22:39:26 +01:00
|
|
|
# Copyright 2014-2022 Mike Fährmann
|
2015-04-05 17:15:27 +02:00
|
|
|
#
|
|
|
|
# This program is free software; you can redistribute it and/or modify
|
|
|
|
# it under the terms of the GNU General Public License version 2 as
|
|
|
|
# published by the Free Software Foundation.
|
|
|
|
|
2021-06-12 00:20:59 +02:00
|
|
|
import sys
|
|
|
|
import json
|
|
|
|
import logging
|
|
|
|
from . import version, config, option, output, extractor, job, util, exception
|
2016-10-04 14:33:50 +02:00
|
|
|
|
2017-01-30 19:40:15 +01:00
|
|
|
# Package metadata
__author__ = "Mike Fährmann"
# Year range kept in sync with the copyright notice in the file header.
__copyright__ = "Copyright 2014-2022 Mike Fährmann"
__license__ = "GPLv2"
__maintainer__ = "Mike Fährmann"
__email__ = "mike_faehrmann@web.de"
# Re-exported from the package's 'version' module.
__version__ = version.__version__
|
2018-04-04 17:30:42 +02:00
|
|
|
|
|
|
|
|
2017-06-09 20:12:15 +02:00
|
|
|
def progress(urls, pformat):
    """Yield each item of 'urls' after announcing it on stderr.

    'pformat' is a str.format_map() template that receives the keys
    'current' (1-based position), 'total' (len(urls)), and 'url'.
    Passing True selects the built-in "[current/total] url" template.
    """
    template = "[{current}/{total}] {url}" if pformat is True else pformat
    total = len(urls)

    for position, url in enumerate(urls, 1):
        fields = {"total": total, "current": position, "url": url}
        print(template.format_map(fields), file=sys.stderr)
        yield url
|
|
|
|
|
|
|
|
|
2019-02-13 17:39:43 +01:00
|
|
|
def parse_inputfile(file, log):
    """Generate URLs (and their options) from the lines of an input file.

    Blank lines and lines beginning with '#' are skipped.

    A line beginning with '-' holds a '<key>=<value>' pair, where <key> is
    a dot-separated option name and <value> is parsed as JSON. Such options
    are collected and attached to the next URL only.
    A line beginning with '-G' has the same format, but is collected as a
    Global option, meant to stay valid for all following URLs.

    Any other line is yielded as a potential URL: as a plain string when no
    options are pending, otherwise wrapped in a util.ExtendedUrl together
    with the pending global and local options.

    Malformed option lines are reported through 'log' and ignored.

    Example input file:

    # settings global options
    -G base-directory = "/tmp/"
    -G skip = false

    # setting local options for the next URL
    -filename="spaces_are_optional.jpg"
    -skip = true

    https://example.org/

    # next URL uses default filename and 'skip' is false.
    https://example.com/index.htm
    """
    global_opts = []
    local_opts = []

    for raw in file:
        entry = raw.strip()

        # nothing to do for blank lines and comments
        if not entry or entry.startswith("#"):
            continue

        if not entry.startswith("-"):
            # URL: hand over all pending options and start collecting anew
            if global_opts or local_opts:
                yield util.ExtendedUrl(entry, global_opts, local_opts)
                global_opts = []
                local_opts = []
            else:
                yield entry
            continue

        # option line: '-G...' is global, everything else is local
        if entry[1:2] == "G":
            target, spec = global_opts, entry[2:]
        else:
            target, spec = local_opts, entry[1:]

        name, separator, raw_value = spec.partition("=")
        if not separator:
            log.warning("input file: invalid <key>=<value> pair: %s", spec)
            continue

        try:
            parsed = json.loads(raw_value.strip())
        except ValueError as exc:
            log.warning("input file: unable to parse '%s': %s", raw_value, exc)
            continue

        # "a.b.c" -> (["a", "b"], "c", value)
        *path, last = name.strip().split(".")
        target.append((path, last, parsed))
|
|
|
|
|
|
|
|
|
2014-10-12 21:56:44 +02:00
|
|
|
def main():
    """Command-line entry point.

    Parses the command-line arguments, applies configuration files and
    option overrides, sets up signal handling, extractor modules, and
    logging, and then performs exactly one action: list modules, list
    extractors, clear the cache, or run a job for every given URL.

    Returns the bitwise OR of all job results (plus 64 for every URL
    without a matching extractor) when jobs were run, 1 when output
    failed with EPIPE, and None otherwise.
    Exits via sys.exit() on KeyboardInterrupt.
    """
    try:
        # A replacement stdout object may have no usable 'encoding'
        # attribute; only inspect it when it is a non-empty value.
        stdout_encoding = getattr(sys.stdout, "encoding", None)
        if sys.stdout and stdout_encoding and \
                stdout_encoding.lower() != "utf-8":
            output.replace_std_streams()

        parser = option.build_parser()
        args = parser.parse_args()
        log = output.initialize_logging(args.loglevel)

        # configuration
        if args.load_config:
            config.load()
        if args.cfgfiles:
            config.load(args.cfgfiles, strict=True)
        if args.yamlfiles:
            config.load(args.yamlfiles, strict=True, fmt="yaml")
        if args.filename:
            # "/O" is a shortcut for the original filename
            if args.filename == "/O":
                args.filename = "{filename}.{extension}"
            config.set((), "filename", args.filename)
        if args.directory:
            config.set((), "base-directory", args.directory)
            config.set((), "directory", ())
        if args.postprocessors:
            config.set((), "postprocessors", args.postprocessors)
        if args.abort:
            config.set((), "skip", "abort:" + str(args.abort))
        if args.terminate:
            config.set((), "skip", "terminate:" + str(args.terminate))
        # explicit command-line options are applied last
        for opts in args.options:
            config.set(*opts)

        # signals
        signals = config.get((), "signals-ignore")
        if signals:
            import signal
            if isinstance(signals, str):
                # tolerate whitespace around the commas ("SIGA, SIGB")
                signals = [name.strip() for name in signals.split(",")]
            for signal_name in signals:
                signal_num = getattr(signal, signal_name, None)
                if signal_num is None:
                    log.warning("signal '%s' is not defined", signal_name)
                else:
                    signal.signal(signal_num, signal.SIG_IGN)

        # extractor modules
        modules = config.get(("extractor",), "modules")
        if modules is not None:
            if isinstance(modules, str):
                # tolerate whitespace around the commas here as well
                modules = [name.strip() for name in modules.split(",")]
            extractor.modules = modules
            extractor._module_iter = iter(modules)

        # loglevels
        output.configure_logging(args.loglevel)
        if args.loglevel >= logging.ERROR:
            config.set(("output",), "mode", "null")
        elif args.loglevel <= logging.DEBUG:
            # only needed for the version report below
            import platform
            import subprocess
            import os.path
            import requests

            extra = ""
            if getattr(sys, "frozen", False):
                extra = " - Executable"
            else:
                # best effort: ask git for the current commit of the
                # directory containing this file; ignore any failure
                try:
                    out, err = subprocess.Popen(
                        ("git", "rev-parse", "--short", "HEAD"),
                        stdout=subprocess.PIPE,
                        stderr=subprocess.PIPE,
                        cwd=os.path.dirname(os.path.abspath(__file__)),
                    ).communicate()
                    if out and not err:
                        extra = " - Git HEAD: " + out.decode().rstrip()
                except (OSError, subprocess.SubprocessError):
                    pass

            log.debug("Version %s%s", __version__, extra)
            log.debug("Python %s - %s",
                      platform.python_version(), platform.platform())
            try:
                log.debug("requests %s - urllib3 %s",
                          requests.__version__,
                          requests.packages.urllib3.__version__)
            except AttributeError:
                pass

        if args.list_modules:
            for module_name in extractor.modules:
                print(module_name)

        elif args.list_extractors:
            for extr in extractor.extractors():
                # extractors without docstring are not listed
                if not extr.__doc__:
                    continue
                print(extr.__name__)
                print(extr.__doc__)
                print("Category:", extr.category,
                      "- Subcategory:", extr.subcategory)
                test = next(extr._get_tests(), None)
                if test:
                    print("Example :", test[0])
                print()

        elif args.clear_cache:
            from . import cache
            log = logging.getLogger("cache")
            cnt = cache.clear(args.clear_cache)

            if cnt is None:
                log.error("Database file not available")
            else:
                log.info(
                    "Deleted %d %s from '%s'",
                    cnt, "entry" if cnt == 1 else "entries", cache._path(),
                )

        else:
            if not args.urls and not args.inputfiles:
                # presumably terminates the program (argparse behavior)
                parser.error(
                    "The following arguments are required: URL\n"
                    "Use 'gallery-dl --help' to get a list of all options.")

            if args.list_urls:
                jobtype = job.UrlJob
                jobtype.maxdepth = args.list_urls
                if config.get(("output",), "fallback", True):
                    jobtype.handle_url = \
                        staticmethod(jobtype.handle_url_fallback)
            else:
                jobtype = args.jobtype or job.DownloadJob

            urls = args.urls
            if args.inputfiles:
                for inputfile in args.inputfiles:
                    try:
                        if inputfile == "-":
                            if sys.stdin:
                                urls += parse_inputfile(sys.stdin, log)
                            else:
                                log.warning("input file: stdin is not readable")
                        else:
                            # '+=' consumes the generator right away,
                            # i.e. while the file is still open
                            with open(inputfile, encoding="utf-8") as file:
                                urls += parse_inputfile(file, log)
                    except OSError as exc:
                        log.warning("input file: %s", exc)

            # unsupported file logging handler
            handler = output.setup_logging_handler(
                "unsupportedfile", fmt="{message}")
            if handler:
                ulog = logging.getLogger("unsupported")
                ulog.addHandler(handler)
                ulog.propagate = False
                job.Job.ulog = ulog

            # progress indicator, only for more than one URL
            pformat = config.get(("output",), "progress", True)
            if pformat and len(urls) > 1 and args.loglevel < logging.ERROR:
                urls = progress(urls, pformat)

            retval = 0
            for url in urls:
                try:
                    log.debug("Starting %s for '%s'", jobtype.__name__, url)
                    if isinstance(url, util.ExtendedUrl):
                        # global options are set permanently,
                        # local options only while this job runs
                        for opts in url.gconfig:
                            config.set(*opts)
                        with config.apply(url.lconfig):
                            retval |= jobtype(url.value).run()
                    else:
                        retval |= jobtype(url).run()
                except exception.TerminateExtraction:
                    pass
                except exception.NoExtractorError:
                    log.error("No suitable extractor found for '%s'", url)
                    retval |= 64
            return retval

    except KeyboardInterrupt:
        sys.exit("\nKeyboardInterrupt")
    except BrokenPipeError:
        pass
    except OSError as exc:
        import errno
        # a broken pipe reported as plain OSError is not worth a traceback
        if exc.errno != errno.EPIPE:
            raise
        return 1
|