[postprocessor:metadata] implement archive options (#2421)
'archive', 'archive-format', and 'archive-prefix'
This commit is contained in:
parent
be3492776b
commit
9bd27b1b8d
@ -599,7 +599,9 @@ Description
|
||||
memory requirements are significantly lower when the
|
||||
amount of stored IDs gets reasonably large.
|
||||
|
||||
Note: archive paths support regular `format string`_ replacements,
|
||||
Note: Archive files that do not already exist get generated automatically.
|
||||
|
||||
Note: Archive paths support regular `format string`_ replacements,
|
||||
but be aware that using external inputs for building local paths
|
||||
may pose a security risk.
|
||||
|
||||
@ -3139,6 +3141,19 @@ Description
|
||||
Note: Only applies for ``"mode": "custom"``.
|
||||
|
||||
|
||||
metadata.archive
|
||||
----------------
|
||||
Type
|
||||
|Path|_
|
||||
Description
|
||||
File to store IDs of generated metadata files in,
|
||||
similar to `extractor.*.archive`_.
|
||||
|
||||
``archive-format`` and ``archive-prefix`` options,
|
||||
akin to `extractor.*.archive-format`_ and `extractor.*.archive-prefix`_,
|
||||
are supported as well.
|
||||
|
||||
|
||||
metadata.mtime
|
||||
--------------
|
||||
Type
|
||||
|
@ -389,8 +389,10 @@ class DownloadJob(Job):
|
||||
|
||||
def initialize(self, kwdict=None):
|
||||
"""Delayed initialization of PathFormat, etc."""
|
||||
cfg = self.extractor.config
|
||||
pathfmt = self.pathfmt = path.PathFormat(self.extractor)
|
||||
extr = self.extractor
|
||||
cfg = extr.config
|
||||
|
||||
pathfmt = self.pathfmt = path.PathFormat(extr)
|
||||
if kwdict:
|
||||
pathfmt.set_directory(kwdict)
|
||||
|
||||
@ -403,17 +405,18 @@ class DownloadJob(Job):
|
||||
archive = cfg("archive")
|
||||
if archive:
|
||||
archive = util.expand_path(archive)
|
||||
archive_format = (cfg("archive-prefix", extr.category) +
|
||||
cfg("archive-format", extr.archive_fmt))
|
||||
try:
|
||||
if "{" in archive:
|
||||
archive = formatter.parse(archive).format_map(kwdict)
|
||||
self.archive = util.DownloadArchive(archive, self.extractor)
|
||||
self.archive = util.DownloadArchive(archive, archive_format)
|
||||
except Exception as exc:
|
||||
self.extractor.log.warning(
|
||||
extr.log.warning(
|
||||
"Failed to open download archive at '%s' ('%s: %s')",
|
||||
archive, exc.__class__.__name__, exc)
|
||||
else:
|
||||
self.extractor.log.debug(
|
||||
"Using download archive '%s'", archive)
|
||||
extr.log.debug("Using download archive '%s'", archive)
|
||||
|
||||
skip = cfg("skip", True)
|
||||
if skip:
|
||||
@ -435,7 +438,7 @@ class DownloadJob(Job):
|
||||
if self.archive:
|
||||
self.archive.check = pathfmt.exists
|
||||
|
||||
postprocessors = self.extractor.config_accumulate("postprocessors")
|
||||
postprocessors = extr.config_accumulate("postprocessors")
|
||||
if postprocessors:
|
||||
self.hooks = collections.defaultdict(list)
|
||||
pp_log = self.get_logger("postprocessor")
|
||||
@ -453,7 +456,7 @@ class DownloadJob(Job):
|
||||
clist = pp_dict.get("blacklist")
|
||||
negate = True
|
||||
if clist and not util.build_extractor_filter(
|
||||
clist, negate)(self.extractor):
|
||||
clist, negate)(extr):
|
||||
continue
|
||||
|
||||
name = pp_dict.get("name")
|
||||
@ -471,8 +474,7 @@ class DownloadJob(Job):
|
||||
pp_list.append(pp_obj)
|
||||
|
||||
if pp_list:
|
||||
self.extractor.log.debug(
|
||||
"Active postprocessor modules: %s", pp_list)
|
||||
extr.log.debug("Active postprocessor modules: %s", pp_list)
|
||||
if "init" in self.hooks:
|
||||
for callback in self.hooks["init"]:
|
||||
callback(pathfmt)
|
||||
|
@ -59,9 +59,35 @@ class MetadataPP(PostProcessor):
|
||||
events = events.split(",")
|
||||
job.register_hooks({event: self.run for event in events}, options)
|
||||
|
||||
archive = options.get("archive")
|
||||
if archive:
|
||||
extr = job.extractor
|
||||
archive = util.expand_path(archive)
|
||||
archive_format = (
|
||||
options.get("archive-prefix", extr.category) +
|
||||
options.get("archive-format", "_MD_" + extr.archive_fmt))
|
||||
try:
|
||||
if "{" in archive:
|
||||
archive = formatter.parse(archive).format_map(
|
||||
job.pathfmt.kwdict)
|
||||
self.archive = util.DownloadArchive(
|
||||
archive, archive_format, "_archive_metadata")
|
||||
except Exception as exc:
|
||||
self.log.warning(
|
||||
"Failed to open download archive at '%s' ('%s: %s')",
|
||||
archive, exc.__class__.__name__, exc)
|
||||
else:
|
||||
self.log.debug("Using download archive '%s'", archive)
|
||||
else:
|
||||
self.archive = None
|
||||
|
||||
self.mtime = options.get("mtime")
|
||||
|
||||
def run(self, pathfmt):
|
||||
archive = self.archive
|
||||
if archive and archive.check(pathfmt.kwdict):
|
||||
return
|
||||
|
||||
directory = self._directory(pathfmt)
|
||||
path = directory + self._filename(pathfmt)
|
||||
|
||||
@ -73,6 +99,9 @@ class MetadataPP(PostProcessor):
|
||||
with open(path, "w", encoding="utf-8") as fp:
|
||||
self.write(fp, pathfmt.kwdict)
|
||||
|
||||
if archive:
|
||||
archive.add(pathfmt.kwdict)
|
||||
|
||||
if self.mtime:
|
||||
mtime = pathfmt.kwdict.get("_mtime")
|
||||
if mtime:
|
||||
|
@ -672,11 +672,14 @@ class ExtendedUrl():
|
||||
|
||||
class DownloadArchive():
|
||||
|
||||
def __init__(self, path, extractor):
|
||||
def __init__(self, path, format_string, cache_key="_archive_key"):
|
||||
con = sqlite3.connect(path, timeout=60, check_same_thread=False)
|
||||
con.isolation_level = None
|
||||
|
||||
self.close = con.close
|
||||
self.cursor = con.cursor()
|
||||
self.keygen = format_string.format_map
|
||||
self._cache_key = cache_key
|
||||
|
||||
try:
|
||||
self.cursor.execute("CREATE TABLE IF NOT EXISTS archive "
|
||||
@ -685,20 +688,16 @@ class DownloadArchive():
|
||||
# fallback for missing WITHOUT ROWID support (#553)
|
||||
self.cursor.execute("CREATE TABLE IF NOT EXISTS archive "
|
||||
"(entry PRIMARY KEY)")
|
||||
self.keygen = (
|
||||
extractor.config("archive-prefix", extractor.category) +
|
||||
extractor.config("archive-format", extractor.archive_fmt)
|
||||
).format_map
|
||||
|
||||
def check(self, kwdict):
|
||||
"""Return True if the item described by 'kwdict' exists in archive"""
|
||||
key = kwdict["_archive_key"] = self.keygen(kwdict)
|
||||
key = kwdict[self._cache_key] = self.keygen(kwdict)
|
||||
self.cursor.execute(
|
||||
"SELECT 1 FROM archive WHERE entry=? LIMIT 1", (key,))
|
||||
return self.cursor.fetchone()
|
||||
|
||||
def add(self, kwdict):
|
||||
"""Add item described by 'kwdict' to archive"""
|
||||
key = kwdict.get("_archive_key") or self.keygen(kwdict)
|
||||
key = kwdict.get(self._cache_key) or self.keygen(kwdict)
|
||||
self.cursor.execute(
|
||||
"INSERT OR IGNORE INTO archive VALUES (?)", (key,))
|
||||
|
Loading…
x
Reference in New Issue
Block a user