[sankakucomplex] extract videos and embeds (closes #308)

This commit is contained in:
Mike Fährmann 2020-10-30 00:53:11 +01:00
parent c3f01dc4e6
commit 98a4d86a01
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88
2 changed files with 70 additions and 29 deletions

View File

@ -1351,6 +1351,26 @@ Description
Waiting a few seconds between each request tries to prevent that.
extractor.sankakucomplex.embeds
-------------------------------
Type
``bool``
Default
``false``
Description
Download video embeds from external sites.
extractor.sankakucomplex.videos
-------------------------------
Type
``bool``
Default
``true``
Description
Download videos embedded directly in articles.
extractor.smugmug.videos
------------------------
Type

View File

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
# Copyright 2019 Mike Fährmann
# Copyright 2019-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@ -9,7 +9,7 @@
"""Extractors for https://www.sankakucomplex.com/"""
from .common import Extractor, Message
from .. import text
from .. import text, util
import re
@ -40,6 +40,21 @@ class SankakucomplexArticleExtractor(SankakucomplexExtractor):
"url": "a1e249173fd6c899a8134fcfbd9c925588a63f7c",
"keyword": "e78fcc23c2711befc0969a45ea5082a29efccf68",
}),
# videos (#308)
(("https://www.sankakucomplex.com/2019/06/11"
"/darling-ol-goddess-shows-off-her-plump-lower-area/"), {
"pattern": r"/wp-content/uploads/2019/06/[^/]+\d\.mp4",
"range": "26-",
"count": 5,
}),
# youtube embeds (#308)
(("https://www.sankakucomplex.com/2015/02/12"
"/snow-miku-2015-live-magical-indeed/"), {
"options": (("embeds", True),),
"pattern": r"https://www.youtube.com/embed/",
"range": "2-",
"count": 2,
}),
)
def items(self):
@ -53,38 +68,44 @@ class SankakucomplexArticleExtractor(SankakucomplexExtractor):
"date" : text.parse_datetime(
extr('property="article:published_time" content="', '"')),
}
imgs = self.images(extr)
data["count"] = len(imgs)
content = extr('<div class="entry-content">', '</article>')
data["tags"] = text.split_html(extr('="meta-tags">', '</div>'))[::2]
yield Message.Version, 1
files = self._extract_images(content)
if self.config("videos", True):
files += self._extract_videos(content)
if self.config("embeds", False):
files += self._extract_embeds(content)
data["count"] = len(files)
yield Message.Directory, data
for img in imgs:
img.update(data)
yield Message.Url, img["url"], img
def images(self, extr):
num = 0
imgs = []
urls = set()
orig = re.compile(r"-\d+x\d+\.")
extr('<div class="entry-content">', '')
while True:
url = extr('data-lazy-src="', '"')
if not url:
return imgs
if url in urls:
continue
for num, url in enumerate(files, 1):
file = text.nameext_from_url(url)
if url[0] == "/":
url = text.urljoin(self.root, url)
url = orig.sub(".", url)
num += 1
imgs.append(text.nameext_from_url(url, {
"url" : url,
"num" : num,
}))
urls.add(url)
file["url"] = url
file["num"] = num
file.update(data)
yield Message.Url, url, file
@staticmethod
def _extract_images(content):
    """Return a list of image URLs collected from 'content'.

    Every 'data-lazy-src="..."' attribute value is taken, the values
    are passed through util.unique(), and a '-<digits>x<digits>.'
    fragment in each URL is collapsed to a plain '.'
    (presumably this removes a thumbnail size suffix - TODO confirm).
    """
    size_suffix = re.compile(r"-\d+x\d+\.")
    lazy_srcs = text.extract_iter(content, 'data-lazy-src="', '"')
    return [size_suffix.sub(".", src) for src in util.unique(lazy_srcs)]
@staticmethod
def _extract_videos(content):
    """Return the 'src' value of every '<source ...>' tag in 'content'.

    The result is a list of strings in document order; it is empty
    when no tag matches. Both single- and double-quoted attribute
    values are recognized.
    """
    source_tags = re.finditer(r"<source [^>]*src=[\"']([^\"']+)", content)
    return [tag.group(1) for tag in source_tags]
@staticmethod
def _extract_embeds(content):
    """Return 'ytdl:'-prefixed URLs for every '<iframe ...>' in 'content'.

    The 'src' attribute value of each iframe tag is taken in document
    order and prepended with the literal string 'ytdl:'. The result is
    an empty list when no iframe with a quoted 'src' is present.
    """
    iframes = re.finditer(r"<iframe [^>]*src=[\"']([^\"']+)", content)
    return ["ytdl:" + frame.group(1) for frame in iframes]
class SankakucomplexTagExtractor(SankakucomplexExtractor):