[postprocessor:metadata] implement archive options ()

'archive', 'archive-format', and 'archive-prefix'
Mike Fährmann 2022-03-20 21:16:46 +01:00
parent be3492776b
commit 9bd27b1b8d
4 changed files with 63 additions and 18 deletions

docs/configuration.rst
@@ -599,7 +599,9 @@ Description
     memory requirements are significantly lower when the
     amount of stored IDs gets reasonably large.
 
-    Note: archive paths support regular `format string`_ replacements,
+    Note: Archive files that do not already exist get generated automatically.
+
+    Note: Archive paths support regular `format string`_ replacements,
     but be aware that using external inputs for building local paths
     may pose a security risk.
@@ -3139,6 +3141,19 @@ Description
 
     Note: Only applies for ``"mode": "custom"``.
 
+metadata.archive
+----------------
+Type
+    |Path|_
+Description
+    File to store IDs of generated metadata files in,
+    similar to `extractor.*.archive`_.
+
+    ``archive-format`` and ``archive-prefix`` options,
+    akin to `extractor.*.archive-format`_ and `extractor.*.archive-prefix`_,
+    are supported as well.
+
 metadata.mtime
 --------------
 Type

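For orientation only, not part of the commit: the three new options mirror their extractor-level counterparts, so a ``metadata`` postprocessor entry using all of them might look like the sketch below, written here as a Python dict. The path, prefix, and format values are invented for the example.

    # Hypothetical "metadata" postprocessor configuration entry.
    # "archive", "archive-prefix", and "archive-format" are the new options;
    # the concrete values below are made up for illustration.
    metadata_postprocessor = {
        "name"          : "metadata",
        "mode"          : "json",
        "archive"       : "~/gallery-dl/metadata-archive.sqlite3",
        "archive-prefix": "meta_",
        "archive-format": "{category}_{id}",
    }
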
gallery_dl/job.py
@@ -389,8 +389,10 @@ class DownloadJob(Job):
     def initialize(self, kwdict=None):
         """Delayed initialization of PathFormat, etc."""
-        cfg = self.extractor.config
-        pathfmt = self.pathfmt = path.PathFormat(self.extractor)
+        extr = self.extractor
+        cfg = extr.config
+        pathfmt = self.pathfmt = path.PathFormat(extr)
 
         if kwdict:
             pathfmt.set_directory(kwdict)
@@ -403,17 +405,18 @@ class DownloadJob(Job):
         archive = cfg("archive")
         if archive:
             archive = util.expand_path(archive)
+            archive_format = (cfg("archive-prefix", extr.category) +
+                              cfg("archive-format", extr.archive_fmt))
             try:
                 if "{" in archive:
                     archive = formatter.parse(archive).format_map(kwdict)
-                self.archive = util.DownloadArchive(archive, self.extractor)
+                self.archive = util.DownloadArchive(archive, archive_format)
             except Exception as exc:
-                self.extractor.log.warning(
+                extr.log.warning(
                     "Failed to open download archive at '%s' ('%s: %s')",
                     archive, exc.__class__.__name__, exc)
             else:
-                self.extractor.log.debug(
-                    "Using download archive '%s'", archive)
+                extr.log.debug("Using download archive '%s'", archive)
 
         skip = cfg("skip", True)
         if skip:
@@ -435,7 +438,7 @@ class DownloadJob(Job):
             if self.archive:
                 self.archive.check = pathfmt.exists
 
-        postprocessors = self.extractor.config_accumulate("postprocessors")
+        postprocessors = extr.config_accumulate("postprocessors")
         if postprocessors:
             self.hooks = collections.defaultdict(list)
             pp_log = self.get_logger("postprocessor")
@@ -453,7 +456,7 @@ class DownloadJob(Job):
                     clist = pp_dict.get("blacklist")
                     negate = True
                 if clist and not util.build_extractor_filter(
-                        clist, negate)(self.extractor):
+                        clist, negate)(extr):
                     continue
 
                 name = pp_dict.get("name")
@@ -471,8 +474,7 @@ class DownloadJob(Job):
                 pp_list.append(pp_obj)
 
             if pp_list:
-                self.extractor.log.debug(
-                    "Active postprocessor modules: %s", pp_list)
+                extr.log.debug("Active postprocessor modules: %s", pp_list)
 
             if "init" in self.hooks:
                 for callback in self.hooks["init"]:
                     callback(pathfmt)

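The key change in job.py is that the archive key is now composed outside DownloadArchive: the ``archive-prefix`` and ``archive-format`` values are concatenated once and handed over as a single format string, whose format_map becomes the key generator. A minimal, self-contained sketch of that composition follows; the config values and the kwdict are stand-ins, not taken from the commit.

    # Stand-ins for cfg("archive-prefix", extr.category) and
    # cfg("archive-format", extr.archive_fmt); the values are hypothetical.
    category    = "example"      # would come from extr.category
    archive_fmt = "{id}"         # would come from extr.archive_fmt
    options     = {}             # user-supplied configuration

    archive_format = (options.get("archive-prefix", category) +
                      options.get("archive-format", archive_fmt))

    # DownloadArchive keeps archive_format.format_map as its keygen,
    # so an archive entry key for a file is produced like this:
    keygen = archive_format.format_map
    print(keygen({"id": 12345}))   # -> "example12345"
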
gallery_dl/postprocessor/metadata.py
@@ -59,9 +59,35 @@ class MetadataPP(PostProcessor):
             events = events.split(",")
         job.register_hooks({event: self.run for event in events}, options)
 
+        archive = options.get("archive")
+        if archive:
+            extr = job.extractor
+            archive = util.expand_path(archive)
+            archive_format = (
+                options.get("archive-prefix", extr.category) +
+                options.get("archive-format", "_MD_" + extr.archive_fmt))
+            try:
+                if "{" in archive:
+                    archive = formatter.parse(archive).format_map(
+                        job.pathfmt.kwdict)
+                self.archive = util.DownloadArchive(
+                    archive, archive_format, "_archive_metadata")
+            except Exception as exc:
+                self.log.warning(
+                    "Failed to open download archive at '%s' ('%s: %s')",
+                    archive, exc.__class__.__name__, exc)
+            else:
+                self.log.debug("Using download archive '%s'", archive)
+        else:
+            self.archive = None
+
         self.mtime = options.get("mtime")
 
     def run(self, pathfmt):
+        archive = self.archive
+        if archive and archive.check(pathfmt.kwdict):
+            return
+
         directory = self._directory(pathfmt)
         path = directory + self._filename(pathfmt)
@@ -73,6 +99,9 @@ class MetadataPP(PostProcessor):
         with open(path, "w", encoding="utf-8") as fp:
             self.write(fp, pathfmt.kwdict)
 
+        if archive:
+            archive.add(pathfmt.kwdict)
+
         if self.mtime:
             mtime = pathfmt.kwdict.get("_mtime")
             if mtime:

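The effect of the metadata.py change is that run() consults its archive before writing and records the entry afterwards, so metadata files that were already generated get skipped on later runs. The condensed sketch below shows only that control flow; ArchiveStub and write_metadata are invented placeholders, not gallery-dl APIs.

    class ArchiveStub:
        """In-memory stand-in for util.DownloadArchive (illustration only)."""
        def __init__(self):
            self.entries = set()
        def check(self, kwdict):
            return kwdict["id"] in self.entries
        def add(self, kwdict):
            self.entries.add(kwdict["id"])

    def run(archive, kwdict, write_metadata):
        # skip files whose metadata has been written before
        if archive and archive.check(kwdict):
            return
        write_metadata(kwdict)
        # record the file so the next run skips it
        if archive:
            archive.add(kwdict)

    stub = ArchiveStub()
    run(stub, {"id": 1}, lambda kw: print("writing metadata for", kw["id"]))
    run(stub, {"id": 1}, lambda kw: print("writing metadata for", kw["id"]))  # skipped
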
gallery_dl/util.py
@@ -672,11 +672,14 @@ class ExtendedUrl():
 class DownloadArchive():
 
-    def __init__(self, path, extractor):
+    def __init__(self, path, format_string, cache_key="_archive_key"):
         con = sqlite3.connect(path, timeout=60, check_same_thread=False)
         con.isolation_level = None
         self.close = con.close
         self.cursor = con.cursor()
+        self.keygen = format_string.format_map
+        self._cache_key = cache_key
 
         try:
             self.cursor.execute("CREATE TABLE IF NOT EXISTS archive "
@@ -685,20 +688,16 @@ class DownloadArchive():
             # fallback for missing WITHOUT ROWID support (#553)
             self.cursor.execute("CREATE TABLE IF NOT EXISTS archive "
                                 "(entry PRIMARY KEY)")
-        self.keygen = (
-            extractor.config("archive-prefix", extractor.category) +
-            extractor.config("archive-format", extractor.archive_fmt)
-        ).format_map
 
     def check(self, kwdict):
         """Return True if the item described by 'kwdict' exists in archive"""
-        key = kwdict["_archive_key"] = self.keygen(kwdict)
+        key = kwdict[self._cache_key] = self.keygen(kwdict)
         self.cursor.execute(
             "SELECT 1 FROM archive WHERE entry=? LIMIT 1", (key,))
         return self.cursor.fetchone()
 
     def add(self, kwdict):
         """Add item described by 'kwdict' to archive"""
-        key = kwdict.get("_archive_key") or self.keygen(kwdict)
+        key = kwdict.get(self._cache_key) or self.keygen(kwdict)
         self.cursor.execute(
             "INSERT OR IGNORE INTO archive VALUES (?)", (key,))