[deviantart] add 'journals' option

This commit is contained in:
Mike Fährmann 2018-07-16 18:14:41 +02:00
parent 00032b828c
commit ff436692bf
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88
4 changed files with 66 additions and 16 deletions

View File

@ -294,6 +294,19 @@ Description Select the directory structure created by the Gallery- and
=========== ===== =========== =====
extractor.deviantart.journals
-----------------------------
=========== =====
Type ``string``
Default ``"html"``
Description Selects the output format of journal entries.
- ``"html"``: HTML with (roughly) the same layout as on DeviantArt.
- ``"text"``: Plain text with image references and HTML tags removed.
- ``"none"``: Don't download journals.
=========== =====
extractor.deviantart.mature extractor.deviantart.mature
--------------------------- ---------------------------
=========== ===== =========== =====
@ -338,7 +351,7 @@ extractor.deviantart.wait-min
=========== ===== =========== =====
Type ``int`` Type ``int``
Default ``0`` Default ``0``
Description Minimum wait time in seconds before any API request. Description Minimum wait time in seconds before API requests.
Note: This value will internally be rounded up Note: This value will internally be rounded up
to the next power of 2. to the next power of 2.

View File

@ -18,8 +18,10 @@
{ {
"refresh-token": null, "refresh-token": null,
"flat": true, "flat": true,
"journals": "html",
"mature": true, "mature": true,
"original": true "original": true,
"wait-min": 0
}, },
"exhentai": "exhentai":
{ {

View File

@ -21,7 +21,7 @@ import re
BASE_PATTERN = ( BASE_PATTERN = (
r"(?:https?://)?(?:" r"(?:https?://)?(?:"
r"(?:www\.)?deviantart\.com/([\w-]+)|" r"(?:www\.)?deviantart\.com/([\w-]+)|"
r"(?!www\.)([\w-]+)\.deviantart\.com)" r"([\w-]+)\.deviantart\.com)"
) )
@ -41,6 +41,11 @@ class DeviantartExtractor(Extractor):
self.user = match.group(1) or match.group(2) if match else None self.user = match.group(1) or match.group(2) if match else None
self.group = False self.group = False
self.commit_journal = {
"html": self._commit_journal_html,
"text": self._commit_journal_text,
}.get(self.config("journals", "html"))
def skip(self, num): def skip(self, num):
self.offset += num self.offset += num
return num return num
@ -77,7 +82,7 @@ class DeviantartExtractor(Extractor):
if "flash" in deviation: if "flash" in deviation:
yield self.commit(deviation, deviation["flash"]) yield self.commit(deviation, deviation["flash"])
if "excerpt" in deviation: if "excerpt" in deviation and self.commit_journal:
journal = self.api.deviation_content(deviation["deviationid"]) journal = self.api.deviation_content(deviation["deviationid"])
yield self.commit_journal(deviation, journal) yield self.commit_journal(deviation, journal)
@ -94,7 +99,6 @@ class DeviantartExtractor(Extractor):
deviation["index"] = deviation["url"].rpartition("-")[2] deviation["index"] = deviation["url"].rpartition("-")[2]
except KeyError: except KeyError:
deviation["index"] = 0 deviation["index"] = 0
if self.user: if self.user:
deviation["username"] = self.user deviation["username"] = self.user
deviation["da_category"] = deviation["category"] deviation["da_category"] = deviation["category"]
@ -108,7 +112,7 @@ class DeviantartExtractor(Extractor):
url = "https:" + url[5:] url = "https:" + url[5:]
return Message.Url, url, deviation return Message.Url, url, deviation
def commit_journal(self, deviation, journal): def _commit_journal_html(self, deviation, journal):
title = text.escape(deviation["title"]) title = text.escape(deviation["title"])
url = deviation["url"] url = deviation["url"]
thumbs = deviation["thumbs"] thumbs = deviation["thumbs"]
@ -142,11 +146,11 @@ class DeviantartExtractor(Extractor):
url=url, url=url,
userurl="{}/{}/".format(self.root, deviation["username"]), userurl="{}/{}/".format(self.root, deviation["username"]),
username=deviation["author"]["username"], username=deviation["author"]["username"],
date=str(date), date=date,
categories=categories, categories=categories,
) )
html = JOURNAL_TEMPLATE.format( html = JOURNAL_TEMPLATE_HTML.format(
title=title, title=title,
html=html.replace(needle, header, 1), html=html.replace(needle, header, 1),
shadow=shadow, shadow=shadow,
@ -157,6 +161,23 @@ class DeviantartExtractor(Extractor):
deviation["extension"] = "htm" deviation["extension"] = "htm"
return Message.Url, html, deviation return Message.Url, html, deviation
@staticmethod
def _commit_journal_text(deviation, journal):
date = datetime.datetime.utcfromtimestamp(deviation["published_time"])
content = "\n".join(
text.unescape(text.remove_html(txt))
for txt in journal["html"].rpartition("<script")[0].split("<br />")
)
txt = JOURNAL_TEMPLATE_TEXT.format(
title=deviation["title"],
username=deviation["author"]["username"],
date=date,
content=content,
)
deviation["extension"] = "txt"
return Message.Url, txt, deviation
@staticmethod @staticmethod
def _find_folder(folders, name): def _find_folder(folders, name):
pattern = r"[^\w]*" + name.replace("-", r"[^\w]+") + r"[^\w]*$" pattern = r"[^\w]*" + name.replace("-", r"[^\w]+") + r"[^\w]*$"
@ -246,12 +267,12 @@ class DeviantartDeviationExtractor(DeviantartExtractor):
subcategory = "deviation" subcategory = "deviation"
archive_fmt = "{index}.{extension}" archive_fmt = "{index}.{extension}"
pattern = [BASE_PATTERN + r"/(?:art|journal)/[^/?&#]+-\d+", pattern = [BASE_PATTERN + r"/(?:art|journal)/[^/?&#]+-\d+",
r"(?:https?://)?(sta\.sh/[a-z0-9]+)"] r"(?:https?://)?sta\.sh/()()[a-z0-9]+"]
test = [ test = [
(("https://www.deviantart.com/shimoda7/art/" (("https://www.deviantart.com/shimoda7/art/"
"For-the-sake-of-a-memory-10073852"), { "For-the-sake-of-a-memory-10073852"), {
"url": "eef0c01b3808c535ea673e7b3654ab5209b910b7", "url": "eef0c01b3808c535ea673e7b3654ab5209b910b7",
"keyword": "b7ed053c3fb54b93c90e5ff8ed9f7a11d47a9c74", "keyword": "925217229da46aeb8ce282675dc8639fa20a892c",
"content": "6a7c74dc823ebbd457bdd9b3c2838a6ee728091e", "content": "6a7c74dc823ebbd457bdd9b3c2838a6ee728091e",
}), }),
("https://www.deviantart.com/zzz/art/zzz-1234567890", { ("https://www.deviantart.com/zzz/art/zzz-1234567890", {
@ -277,7 +298,7 @@ class DeviantartDeviationExtractor(DeviantartExtractor):
] ]
def __init__(self, match): def __init__(self, match):
DeviantartExtractor.__init__(self) DeviantartExtractor.__init__(self, match)
self.url = match.group(0) self.url = match.group(0)
if not self.url.startswith("http"): if not self.url.startswith("http"):
self.url = "https://" + self.url self.url = "https://" + self.url
@ -310,10 +331,10 @@ class DeviantartFavoriteExtractor(DeviantartExtractor):
def deviations(self): def deviations(self):
folders = self.api.collections_folders(self.user) folders = self.api.collections_folders(self.user)
if self.flat: if self.flat:
return itertools.chain.from_iterable([ return itertools.chain.from_iterable(
self.api.collections(self.user, folder["folderid"]) self.api.collections(self.user, folder["folderid"])
for folder in folders for folder in folders
]) )
else: else:
return self._folder_urls(folders, "favourites") return self._folder_urls(folders, "favourites")
@ -363,6 +384,14 @@ class DeviantartJournalExtractor(DeviantartExtractor):
"url": "38db2a0d3a587a7e0f9dba7ff7d274610ebefe44", "url": "38db2a0d3a587a7e0f9dba7ff7d274610ebefe44",
"keyword": "8d11b458f389188cc1f00d09694ce4e00c43efcc", "keyword": "8d11b458f389188cc1f00d09694ce4e00c43efcc",
}), }),
("https://www.deviantart.com/angrywhitewanker/journal/", {
"url": "b2a8e74d275664b1a4acee0fca0a6fd33298571e",
"options": (("journals", "text"),),
}),
("https://www.deviantart.com/angrywhitewanker/journal/", {
"count": 0,
"options": (("journals", "none"),),
}),
("https://www.deviantart.com/shimoda7/journal/?catpath=/", None), ("https://www.deviantart.com/shimoda7/journal/?catpath=/", None),
("https://angrywhitewanker.deviantart.com/journal/", None), ("https://angrywhitewanker.deviantart.com/journal/", None),
("https://shimoda7.deviantart.com/journal/?catpath=/", None), ("https://shimoda7.deviantart.com/journal/?catpath=/", None),
@ -629,7 +658,7 @@ HEADER_CUSTOM_TEMPLATE = """<div class='boxtop journaltop'>
Journal Entry: <span>{date}</span> Journal Entry: <span>{date}</span>
""" """
JOURNAL_TEMPLATE = """text:<!DOCTYPE html> JOURNAL_TEMPLATE_HTML = """text:<!DOCTYPE html>
<html> <html>
<head> <head>
<meta charset="utf-8"> <meta charset="utf-8">
@ -676,3 +705,9 @@ roses/cssmin/desktop.css?1491362542749" >
</body> </body>
</html> </html>
""" """
JOURNAL_TEMPLATE_TEXT = """text:{title}
by {username}, {date}
{content}
"""

View File

@ -155,8 +155,8 @@ class ImgurAlbumExtractor(ImgurExtractor):
("https://imgur.com/a/RhJXhVT/all", { # 7 character album hash ("https://imgur.com/a/RhJXhVT/all", { # 7 character album hash
"url": "695ef0c950023362a0163ee5041796300db76674", "url": "695ef0c950023362a0163ee5041796300db76674",
}), }),
("https://imgur.com/t/unmuted/FVyxO32", { # unmuted URL ("https://imgur.com/t/unmuted/YMqBcua", { # unmuted URL
"url": "1df12d96438ad9018ace7665dc893419ce9ec867", "url": "86b4747f8147cec7602f0214e267309af73a8655",
}), }),
("https://imgur.com/a/TcBmQ", { ("https://imgur.com/a/TcBmQ", {
"exception": exception.NotFoundError, "exception": exception.NotFoundError,