[tumblr] enable date-min/-max/-format options (#337)

This commit is contained in:
Mike Fährmann 2019-07-16 23:08:27 +02:00
parent 09f37fde39
commit 8d1ae9b715
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88
2 changed files with 52 additions and 20 deletions

View File

@@ -420,6 +420,18 @@ Description Like `image-unique`__, but applies to delegated URLs
__ `extractor.*.image-unique`_
extractor.*.date-format
----------------------------
=========== =====
Type ``string``
Default ``"%Y-%m-%dT%H:%M:%S"``
Description Format string used to parse ``string`` values of
`date-min` and `date-max`.
See |strptime|_ for a list of formatting directives.
=========== =====
Extractor-specific Options
==========================
@@ -776,24 +788,9 @@ Description Retrieve additional comments by resolving the ``more`` comment
extractor.reddit.date-min & .date-max
-------------------------------------
=========== =====
Type ``integer`` or ``string``
Type |Date|_
Default ``0`` and ``253402210800`` (timestamp of |datetime.max|_)
Description Ignore all submissions posted before/after this date.
* If this is an ``integer``, it represents the date as UTC timestamp.
* If this is a ``string``, it will get parsed according to date-format_.
=========== =====
extractor.reddit.date-format
----------------------------
=========== =====
Type ``string``
Default ``"%Y-%m-%dT%H:%M:%S"``
Description An explicit format string used to parse the ``string`` values of
`date-min and date-max`_.
See |strptime|_ for a list of formatting directives.
=========== =====
@@ -870,6 +867,15 @@ Description Download blog avatars.
=========== =====
extractor.tumblr.date-min & .date-max
-------------------------------------
=========== =====
Type |Date|_
Default ``0`` and ``null``
Description Ignore all posts published before/after this date.
=========== =====
extractor.tumblr.external
-------------------------
=========== =====
@@ -1546,6 +1552,20 @@ Custom Types
============
Date
----
=========== =====
Type ``string`` or ``integer``
Examples * ``"2019-01-01T00:00:00"``
* ``"2019"`` with ``"%Y"`` as date-format_
* ``1546297200``
Description A |Date|_ value represents a specific point in time.
* If given as ``string``, it is parsed according to date-format_.
* If given as ``integer``, it is interpreted as UTC timestamp.
=========== =====
Path
----
=========== =====
@@ -1667,6 +1687,7 @@ Description An object with the ``name`` of a post-processor and its options.
.. |webbrowser.open()| replace:: ``webbrowser.open()``
.. |datetime| replace:: ``datetime``
.. |datetime.max| replace:: ``datetime.max``
.. |Date| replace:: ``Date``
.. |Path| replace:: ``Path``
.. |Last-Modified| replace:: ``Last-Modified``
.. |Logging Configuration| replace:: ``Logging Configuration``
@@ -1675,8 +1696,7 @@ Description An object with the ``name`` of a post-processor and its options.
.. _base-directory: `extractor.*.base-directory`_
.. _skipped: `extractor.*.skip`_
.. _`date-min and date-max`: `extractor.reddit.date-min & .date-max`_
.. _date-format: extractor.reddit.date-format_
.. _date-format: `extractor.*.date-format`_
.. _deviantart.metadata: extractor.deviantart.metadata_
.. _.netrc: https://stackoverflow.com/tags/.netrc/info

View File

@@ -65,11 +65,15 @@ class TumblrExtractor(Extractor):
if self.reblogs == "same-blog":
self._skip_reblog = self._skip_reblog_same_blog
self.date_min, self.api.before = self._get_date_min_max(0, None)
def items(self):
blog = None
yield Message.Version, 1
for post in self.posts():
if self.date_min > post["timestamp"]:
return
if post["type"] not in self.types:
continue
if not blog:
@@ -223,6 +227,11 @@ class TumblrUserExtractor(TumblrExtractor):
"count": 2,
"keyword": {"tags": ["test", "private", "hidden"]},
}),
("https://mikf123.tumblr.com/", { # date-min/-max/-format (#337)
"count": 4,
"options": (("date-min", "201804"), ("date-max", "201805"),
("date-format", "%Y%m"))
}),
("https://demo.tumblr.com/page/2"),
("https://demo.tumblr.com/archive"),
("tumblr:http://www.b-authentique.com/"),
@@ -280,6 +289,7 @@ class TumblrPostExtractor(TumblrExtractor):
TumblrExtractor.__init__(self, match)
self.post_id = match.group(3)
self.reblogs = True
self.date_min = 0
def posts(self):
return self.api.posts(self.blog, {"id": self.post_id})
@@ -328,7 +338,7 @@ class TumblrAPI(oauth.OAuth1API):
def __init__(self, extractor):
oauth.OAuth1API.__init__(self, extractor)
self.posts_type = None
self.posts_type = self.before = None
def info(self, blog):
"""Return general information about a blog"""
@@ -350,6 +360,8 @@ class TumblrAPI(oauth.OAuth1API):
params.update({"offset": 0, "limit": 50, "reblog_info": "true"})
if self.posts_type:
params["type"] = self.posts_type
if self.before:
params["before"] = self.before
while True:
data = self._call(blog, "posts", params)
self.BLOG_CACHE[blog] = data["blog"]
@@ -360,7 +372,7 @@ class TumblrAPI(oauth.OAuth1API):
def likes(self, blog):
"""Retrieve liked posts"""
params = {"limit": 50}
params = {"limit": "50", "before": self.before}
while True:
posts = self._call(blog, "likes", params)["liked_posts"]
if not posts: