[pillowfort] add 'inline' option (#846)

to support images present in a post's 'content',
but not listed in 'media'.

also separates the file hash present at the beginning
of each 'filename' into its own field.
This commit is contained in:
Mike Fährmann 2021-05-17 02:57:02 +02:00
parent efa6cc8ec3
commit a7e4917ee1
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88
3 changed files with 38 additions and 6 deletions

View File

@ -1316,6 +1316,16 @@ Description
Follow links to external sites, e.g. Twitter,
extractor.pillowfort.inline
---------------------------
Type
``bool``
Default
``true``
Description
Extract inline images.
extractor.pillowfort.reblogs
----------------------------
Type

View File

@ -177,6 +177,7 @@
"pillowfort":
{
"external": false,
"inline": true,
"reblogs": false
},
"pinterest":

View File

@ -10,6 +10,7 @@
from .common import Extractor, Message
from .. import text
import re
BASE_PATTERN = r"(?:https?://)?www\.pillowfort\.social"
@ -28,18 +29,26 @@ class PillowfortExtractor(Extractor):
self.item = match.group(1)
def items(self):
inline = self.config("inline", True)
reblogs = self.config("reblogs", False)
external = self.config("external", False)
if inline:
inline = re.compile(r'src="(https://img\d+\.pillowfort\.social'
r'/posts/[^"]+)').findall
for post in self.posts():
if "original_post" in post and not reblogs:
continue
files = post["media"]
del post["media"]
files = post.pop("media")
if inline:
for url in inline(post["content"]):
files.append({"url": url})
post["date"] = text.parse_datetime(
post["created_at"], "%Y-%m-%dT%H:%M:%S.%f%z")
post["post_id"] = post.pop("id")
yield Message.Directory, post
post["num"] = 0
@ -57,9 +66,17 @@ class PillowfortExtractor(Extractor):
msgtype = Message.Url
post.update(file)
post["date"] = text.parse_datetime(
file["created_at"], "%Y-%m-%dT%H:%M:%S.%f%z")
yield msgtype, url, text.nameext_from_url(url, post)
text.nameext_from_url(url, post)
post["hash"], _, post["filename"] = \
post["filename"].partition("_")
if "id" not in file:
post["id"] = post["hash"]
if "created_at" in file:
post["date"] = text.parse_datetime(
file["created_at"], "%Y-%m-%dT%H:%M:%S.%f%z")
yield msgtype, url, post
class PillowfortPostExtractor(PillowfortExtractor):
@ -120,10 +137,14 @@ class PillowfortPostExtractor(PillowfortExtractor):
},
}),
("https://www.pillowfort.social/posts/1557500", {
"options": (("external", True),),
"options": (("external", True), ("inline", False)),
"pattern": r"https://twitter\.com/Aliciawitdaart/status"
r"/1282862493841457152",
}),
("https://www.pillowfort.social/posts/1672518", {
"options": (("inline", True),),
"count": 3,
}),
)
def posts(self):