[tumblr] enable date-min/-max/-format options (#337)

This commit is contained in:
Mike Fährmann 2019-07-16 23:08:27 +02:00
parent 09f37fde39
commit 8d1ae9b715
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88
2 changed files with 52 additions and 20 deletions

View File

@ -420,6 +420,18 @@ Description Like `image-unique`__, but applies to delegated URLs
__ `extractor.*.image-unique`_ __ `extractor.*.image-unique`_
extractor.*.date-format
----------------------------
=========== =====
Type ``string``
Default ``"%Y-%m-%dT%H:%M:%S"``
Description Format string used to parse ``string`` values of
`date-min` and `date-max`.
See |strptime|_ for a list of formatting directives.
=========== =====
Extractor-specific Options Extractor-specific Options
========================== ==========================
@ -776,24 +788,9 @@ Description Retrieve additional comments by resolving the ``more`` comment
extractor.reddit.date-min & .date-max extractor.reddit.date-min & .date-max
------------------------------------- -------------------------------------
=========== ===== =========== =====
Type ``integer`` or ``string`` Type |Date|_
Default ``0`` and ``253402210800`` (timestamp of |datetime.max|_) Default ``0`` and ``253402210800`` (timestamp of |datetime.max|_)
Description Ignore all submissions posted before/after this date. Description Ignore all submissions posted before/after this date.
* If this is an ``integer``, it represents the date as a UTC timestamp.
* If this is a ``string``, it will get parsed according to date-format_.
=========== =====
extractor.reddit.date-format
----------------------------
=========== =====
Type ``string``
Default ``"%Y-%m-%dT%H:%M:%S"``
Description An explicit format string used to parse the ``string`` values of
`date-min and date-max`_.
See |strptime|_ for a list of formatting directives.
=========== ===== =========== =====
@ -870,6 +867,15 @@ Description Download blog avatars.
=========== ===== =========== =====
extractor.tumblr.date-min & .date-max
-------------------------------------
=========== =====
Type |Date|_
Default ``0`` and ``null``
Description Ignore all posts published before/after this date.
=========== =====
extractor.tumblr.external extractor.tumblr.external
------------------------- -------------------------
=========== ===== =========== =====
@ -1546,6 +1552,20 @@ Custom Types
============ ============
Date
----
=========== =====
Type ``string`` or ``integer``
Examples * ``"2019-01-01T00:00:00"``
* ``"2019"`` with ``"%Y"`` as date-format_
* ``1546297200``
Description A |Date|_ value represents a specific point in time.
* If given as ``string``, it is parsed according to date-format_.
* If given as ``integer``, it is interpreted as a UTC timestamp.
=========== =====
Path Path
---- ----
=========== ===== =========== =====
@ -1667,6 +1687,7 @@ Description An object with the ``name`` of a post-processor and its options.
.. |webbrowser.open()| replace:: ``webbrowser.open()`` .. |webbrowser.open()| replace:: ``webbrowser.open()``
.. |datetime| replace:: ``datetime`` .. |datetime| replace:: ``datetime``
.. |datetime.max| replace:: ``datetime.max`` .. |datetime.max| replace:: ``datetime.max``
.. |Date| replace:: ``Date``
.. |Path| replace:: ``Path`` .. |Path| replace:: ``Path``
.. |Last-Modified| replace:: ``Last-Modified`` .. |Last-Modified| replace:: ``Last-Modified``
.. |Logging Configuration| replace:: ``Logging Configuration`` .. |Logging Configuration| replace:: ``Logging Configuration``
@ -1675,8 +1696,7 @@ Description An object with the ``name`` of a post-processor and its options.
.. _base-directory: `extractor.*.base-directory`_ .. _base-directory: `extractor.*.base-directory`_
.. _skipped: `extractor.*.skip`_ .. _skipped: `extractor.*.skip`_
.. _`date-min and date-max`: `extractor.reddit.date-min & .date-max`_ .. _date-format: `extractor.*.date-format`_
.. _date-format: extractor.reddit.date-format_
.. _deviantart.metadata: extractor.deviantart.metadata_ .. _deviantart.metadata: extractor.deviantart.metadata_
.. _.netrc: https://stackoverflow.com/tags/.netrc/info .. _.netrc: https://stackoverflow.com/tags/.netrc/info

View File

@ -65,11 +65,15 @@ class TumblrExtractor(Extractor):
if self.reblogs == "same-blog": if self.reblogs == "same-blog":
self._skip_reblog = self._skip_reblog_same_blog self._skip_reblog = self._skip_reblog_same_blog
self.date_min, self.api.before = self._get_date_min_max(0, None)
def items(self): def items(self):
blog = None blog = None
yield Message.Version, 1 yield Message.Version, 1
for post in self.posts(): for post in self.posts():
if self.date_min > post["timestamp"]:
return
if post["type"] not in self.types: if post["type"] not in self.types:
continue continue
if not blog: if not blog:
@ -223,6 +227,11 @@ class TumblrUserExtractor(TumblrExtractor):
"count": 2, "count": 2,
"keyword": {"tags": ["test", "private", "hidden"]}, "keyword": {"tags": ["test", "private", "hidden"]},
}), }),
("https://mikf123.tumblr.com/", { # date-min/-max/-format (#337)
"count": 4,
"options": (("date-min", "201804"), ("date-max", "201805"),
("date-format", "%Y%m"))
}),
("https://demo.tumblr.com/page/2"), ("https://demo.tumblr.com/page/2"),
("https://demo.tumblr.com/archive"), ("https://demo.tumblr.com/archive"),
("tumblr:http://www.b-authentique.com/"), ("tumblr:http://www.b-authentique.com/"),
@ -280,6 +289,7 @@ class TumblrPostExtractor(TumblrExtractor):
TumblrExtractor.__init__(self, match) TumblrExtractor.__init__(self, match)
self.post_id = match.group(3) self.post_id = match.group(3)
self.reblogs = True self.reblogs = True
self.date_min = 0
def posts(self): def posts(self):
return self.api.posts(self.blog, {"id": self.post_id}) return self.api.posts(self.blog, {"id": self.post_id})
@ -328,7 +338,7 @@ class TumblrAPI(oauth.OAuth1API):
def __init__(self, extractor): def __init__(self, extractor):
oauth.OAuth1API.__init__(self, extractor) oauth.OAuth1API.__init__(self, extractor)
self.posts_type = None self.posts_type = self.before = None
def info(self, blog): def info(self, blog):
"""Return general information about a blog""" """Return general information about a blog"""
@ -350,6 +360,8 @@ class TumblrAPI(oauth.OAuth1API):
params.update({"offset": 0, "limit": 50, "reblog_info": "true"}) params.update({"offset": 0, "limit": 50, "reblog_info": "true"})
if self.posts_type: if self.posts_type:
params["type"] = self.posts_type params["type"] = self.posts_type
if self.before:
params["before"] = self.before
while True: while True:
data = self._call(blog, "posts", params) data = self._call(blog, "posts", params)
self.BLOG_CACHE[blog] = data["blog"] self.BLOG_CACHE[blog] = data["blog"]
@ -360,7 +372,7 @@ class TumblrAPI(oauth.OAuth1API):
def likes(self, blog): def likes(self, blog):
"""Retrieve liked posts""" """Retrieve liked posts"""
params = {"limit": 50} params = {"limit": "50", "before": self.before}
while True: while True:
posts = self._call(blog, "likes", params)["liked_posts"] posts = self._call(blog, "likes", params)["liked_posts"]
if not posts: if not posts: