allow specifying a minimum/maximum for 'sleep-*' options (#1835)
For example, '"sleep-request": [5.0, 10.0]' waits a random duration of 5 to 10 seconds between HTTP requests.
This commit is contained in:
parent
bd845303ad
commit
c9e6693530
@ -314,7 +314,7 @@ Description
|
||||
extractor.*.sleep
|
||||
-----------------
|
||||
Type
|
||||
``float``
|
||||
|Duration|_
|
||||
Default
|
||||
``0``
|
||||
Description
|
||||
@ -324,7 +324,7 @@ Description
|
||||
extractor.*.sleep-extractor
|
||||
---------------------------
|
||||
Type
|
||||
``float``
|
||||
|Duration|_
|
||||
Default
|
||||
``0``
|
||||
Description
|
||||
@ -335,7 +335,7 @@ Description
|
||||
extractor.*.sleep-request
|
||||
-------------------------
|
||||
Type
|
||||
``float``
|
||||
|Duration|_
|
||||
Default
|
||||
``0``
|
||||
Description
|
||||
@ -3167,7 +3167,8 @@ Custom Types
|
||||
Date
|
||||
----
|
||||
Type
|
||||
``string`` or ``integer``
|
||||
* ``string``
|
||||
* ``integer``
|
||||
Example
|
||||
* ``"2019-01-01T00:00:00"``
|
||||
* ``"2019"`` with ``"%Y"`` as `date-format`_
|
||||
@ -3179,10 +3180,28 @@ Description
|
||||
* If given as ``integer``, it is interpreted as UTC timestamp.
|
||||
|
||||
|
||||
Duration
|
||||
--------
|
||||
Type
|
||||
* ``float``
|
||||
* ``list`` with 2 ``floats``
|
||||
Example
|
||||
* ``2.85``
|
||||
* ``[1.5, 3.0]``
|
||||
Description
|
||||
A |Duration|_ represents a span of time in seconds.
|
||||
|
||||
* If given as a single ``float``, it will be used as that exact value.
|
||||
* If given as a ``list`` with 2 floating-point numbers ``a`` & ``b``,
|
||||
a value ``N`` will be randomly chosen with uniform distribution such that ``a <= N <= b``.
|
||||
(see `random.uniform() <https://docs.python.org/3/library/random.html#random.uniform>`_)
|
||||
|
||||
|
||||
Path
|
||||
----
|
||||
Type
|
||||
``string`` or ``list`` of ``strings``
|
||||
* ``string``
|
||||
* ``list`` of ``strings``
|
||||
Example
|
||||
* ``"file.ext"``
|
||||
* ``"~/path/to/file.ext"``
|
||||
@ -3328,6 +3347,7 @@ Description
|
||||
.. |datetime| replace:: ``datetime``
|
||||
.. |datetime.max| replace:: ``datetime.max``
|
||||
.. |Date| replace:: ``Date``
|
||||
.. |Duration| replace:: ``Duration``
|
||||
.. |Path| replace:: ``Path``
|
||||
.. |Last-Modified| replace:: ``Last-Modified``
|
||||
.. |Logging Configuration| replace:: ``Logging Configuration``
|
||||
|
@ -54,13 +54,13 @@ class Extractor():
|
||||
self._retries = self.config("retries", 4)
|
||||
self._timeout = self.config("timeout", 30)
|
||||
self._verify = self.config("verify", True)
|
||||
self.request_interval = self.config(
|
||||
"sleep-request", self.request_interval)
|
||||
self._interval = util.build_duration_func(
|
||||
self.config("sleep-request", self.request_interval),
|
||||
self.request_interval_min,
|
||||
)
|
||||
|
||||
if self._retries < 0:
|
||||
self._retries = float("inf")
|
||||
if self.request_interval < self.request_interval_min:
|
||||
self.request_interval = self.request_interval_min
|
||||
|
||||
self._init_session()
|
||||
self._init_cookies()
|
||||
@ -114,8 +114,8 @@ class Extractor():
|
||||
response = None
|
||||
tries = 1
|
||||
|
||||
if self.request_interval:
|
||||
seconds = (self.request_interval -
|
||||
if self._interval:
|
||||
seconds = (self._interval() -
|
||||
(time.time() - Extractor.request_timestamp))
|
||||
if seconds > 0.0:
|
||||
self.log.debug("Sleeping for %.5s seconds", seconds)
|
||||
|
@ -72,9 +72,9 @@ class Job():
|
||||
log = extractor.log
|
||||
msg = None
|
||||
|
||||
sleep = extractor.config("sleep-extractor")
|
||||
sleep = util.build_duration_func(extractor.config("sleep-extractor"))
|
||||
if sleep:
|
||||
time.sleep(sleep)
|
||||
time.sleep(sleep())
|
||||
|
||||
try:
|
||||
for msg in extractor:
|
||||
@ -236,7 +236,7 @@ class DownloadJob(Job):
|
||||
return
|
||||
|
||||
if self.sleep:
|
||||
time.sleep(self.sleep)
|
||||
time.sleep(self.sleep())
|
||||
|
||||
# download from URL
|
||||
if not self.download(url):
|
||||
@ -398,7 +398,7 @@ class DownloadJob(Job):
|
||||
if kwdict:
|
||||
pathfmt.set_directory(kwdict)
|
||||
|
||||
self.sleep = cfg("sleep")
|
||||
self.sleep = util.build_duration_func(cfg("sleep"))
|
||||
self.fallback = cfg("fallback", True)
|
||||
if not cfg("download", True):
|
||||
# monkey-patch method to do nothing and always return True
|
||||
@ -541,7 +541,7 @@ class SimulationJob(DownloadJob):
|
||||
self.pathfmt.set_filename(kwdict)
|
||||
self.out.skip(self.pathfmt.path)
|
||||
if self.sleep:
|
||||
time.sleep(self.sleep)
|
||||
time.sleep(self.sleep())
|
||||
if self.archive:
|
||||
self.archive.add(kwdict)
|
||||
|
||||
@ -695,9 +695,10 @@ class DataJob(Job):
|
||||
self.filter = util.identity if private else util.filter_dict
|
||||
|
||||
def run(self):
|
||||
sleep = self.extractor.config("sleep-extractor")
|
||||
sleep = util.build_duration_func(
|
||||
self.extractor.config("sleep-extractor"))
|
||||
if sleep:
|
||||
time.sleep(sleep)
|
||||
time.sleep(sleep())
|
||||
|
||||
# collect data
|
||||
try:
|
||||
|
@ -409,6 +409,24 @@ def compile_expression(expr, name="<expr>", globals=GLOBALS):
|
||||
return functools.partial(eval, code_object, globals)
|
||||
|
||||
|
||||
def build_duration_func(duration, min=0.0):
    """Build a zero-argument callable that produces a duration in seconds.

    Args:
        duration: Either a single number, used as an exact value, or a
            pair ``(lower, upper)`` from which a value is drawn with
            uniform distribution on every call.  Falsy values
            (``None``, ``0``, an empty list) mean "nothing configured".
        min: Lower bound applied to every produced value.  The name
            shadows the builtin but is kept so keyword callers still work.

    Returns:
        ``None`` if no duration is configured and no minimum is enforced;
        otherwise a callable taking no arguments and returning a number
        that is never smaller than ``min``.
    """
    if not duration:
        if not min:
            return None
        # A required minimum still applies when nothing (or 0) is
        # configured; returning None here would disable it entirely.
        return lambda: min

    try:
        lower, upper = duration
    except TypeError:
        # Not iterable, so 'duration' is a single number.
        value = duration if duration > min else min
        return lambda: value

    # Clamp both bounds so the random value can never undercut 'min'.
    return functools.partial(
        random.uniform,
        lower if lower > min else min,
        upper if upper > min else min,
    )
|
||||
|
||||
|
||||
def build_predicate(predicates):
|
||||
if not predicates:
|
||||
return lambda url, kwdict: True
|
||||
|
Loading…
x
Reference in New Issue
Block a user