allow specifying a minimum/maximum for 'sleep-*' options (#1835)

for example '"sleep-request": [5.0, 10.0]' to wait between 5 and 10
seconds between each HTTP request
This commit is contained in:
Mike Fährmann 2021-09-14 17:40:05 +02:00
parent bd845303ad
commit c9e6693530
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88
4 changed files with 57 additions and 18 deletions

View File

@ -314,7 +314,7 @@ Description
extractor.*.sleep
-----------------
Type
``float``
|Duration|_
Default
``0``
Description
@ -324,7 +324,7 @@ Description
extractor.*.sleep-extractor
---------------------------
Type
``float``
|Duration|_
Default
``0``
Description
@ -335,7 +335,7 @@ Description
extractor.*.sleep-request
-------------------------
Type
``float``
|Duration|_
Default
``0``
Description
@ -3167,7 +3167,8 @@ Custom Types
Date
----
Type
``string`` or ``integer``
* ``string``
* ``integer``
Example
* ``"2019-01-01T00:00:00"``
* ``"2019"`` with ``"%Y"`` as `date-format`_
@ -3179,10 +3180,28 @@ Description
* If given as ``integer``, it is interpreted as UTC timestamp.
Duration
--------
Type
* ``float``
* ``list`` with 2 ``floats``
Example
* ``2.85``
* ``[1.5, 3.0]``
Description
A |Duration|_ represents a span of time in seconds.
* If given as a single ``float``, it will be used as that exact value.
* If given as a ``list`` with 2 floating-point numbers ``a`` & ``b``,
a value ``N`` will be randomly chosen with uniform distribution such that ``a <= N <= b``.
(see `random.uniform() <https://docs.python.org/3/library/random.html#random.uniform>`_)
Path
----
Type
``string`` or ``list`` of ``strings``
* ``string``
* ``list`` of ``strings``
Example
* ``"file.ext"``
* ``"~/path/to/file.ext"``
@ -3328,6 +3347,7 @@ Description
.. |datetime| replace:: ``datetime``
.. |datetime.max| replace:: ``datetime.max``
.. |Date| replace:: ``Date``
.. |Duration| replace:: ``Duration``
.. |Path| replace:: ``Path``
.. |Last-Modified| replace:: ``Last-Modified``
.. |Logging Configuration| replace:: ``Logging Configuration``

View File

@ -54,13 +54,13 @@ class Extractor():
self._retries = self.config("retries", 4)
self._timeout = self.config("timeout", 30)
self._verify = self.config("verify", True)
self.request_interval = self.config(
"sleep-request", self.request_interval)
self._interval = util.build_duration_func(
self.config("sleep-request", self.request_interval),
self.request_interval_min,
)
if self._retries < 0:
self._retries = float("inf")
if self.request_interval < self.request_interval_min:
self.request_interval = self.request_interval_min
self._init_session()
self._init_cookies()
@ -114,8 +114,8 @@ class Extractor():
response = None
tries = 1
if self.request_interval:
seconds = (self.request_interval -
if self._interval:
seconds = (self._interval() -
(time.time() - Extractor.request_timestamp))
if seconds > 0.0:
self.log.debug("Sleeping for %.5s seconds", seconds)

View File

@ -72,9 +72,9 @@ class Job():
log = extractor.log
msg = None
sleep = extractor.config("sleep-extractor")
sleep = util.build_duration_func(extractor.config("sleep-extractor"))
if sleep:
time.sleep(sleep)
time.sleep(sleep())
try:
for msg in extractor:
@ -236,7 +236,7 @@ class DownloadJob(Job):
return
if self.sleep:
time.sleep(self.sleep)
time.sleep(self.sleep())
# download from URL
if not self.download(url):
@ -398,7 +398,7 @@ class DownloadJob(Job):
if kwdict:
pathfmt.set_directory(kwdict)
self.sleep = cfg("sleep")
self.sleep = util.build_duration_func(cfg("sleep"))
self.fallback = cfg("fallback", True)
if not cfg("download", True):
# monkey-patch method to do nothing and always return True
@ -541,7 +541,7 @@ class SimulationJob(DownloadJob):
self.pathfmt.set_filename(kwdict)
self.out.skip(self.pathfmt.path)
if self.sleep:
time.sleep(self.sleep)
time.sleep(self.sleep())
if self.archive:
self.archive.add(kwdict)
@ -695,9 +695,10 @@ class DataJob(Job):
self.filter = util.identity if private else util.filter_dict
def run(self):
sleep = self.extractor.config("sleep-extractor")
sleep = util.build_duration_func(
self.extractor.config("sleep-extractor"))
if sleep:
time.sleep(sleep)
time.sleep(sleep())
# collect data
try:

View File

@ -409,6 +409,24 @@ def compile_expression(expr, name="<expr>", globals=GLOBALS):
return functools.partial(eval, code_object, globals)
def build_duration_func(duration, min=0.0):
    """Build a zero-argument function that returns a duration in seconds.

    Args:
        duration: Either a single number, which is used as a fixed value,
            or a sequence of two numbers (lower, upper) from which a value
            is drawn with random.uniform() on every call.
            Falsy values (None, 0, empty list) mean "nothing configured".
        min: Lower limit applied to a fixed 'duration' and to both bounds
            of a range.

    Returns:
        None if 'duration' is falsy and 'min' is falsy as well;
        otherwise a callable that takes no arguments and returns the
        number of seconds.
    """
    if not duration:
        # A missing or zero duration must not bypass a non-zero minimum.
        if min:
            return lambda: min
        return None

    try:
        lower, upper = duration
    except TypeError:
        # Not iterable: treat 'duration' as a single fixed value.
        fixed = duration if duration > min else min
        return lambda: fixed

    # Clamp both bounds so that no drawn value can fall below 'min'.
    return functools.partial(
        random.uniform,
        lower if lower > min else min,
        upper if upper > min else min,
    )
def build_predicate(predicates):
if not predicates:
return lambda url, kwdict: True