[twitter] improve

- update metadata structure
  - combine all user… entries into their own dict
  - let 'user' always specify the Timeline owner
  - add 'author' entry that specifies the original Tweet author
- create directories per post (closes #491)
- fix username issues with /i/web/ URLs
This commit is contained in:
Mike Fährmann 2019-11-30 21:51:08 +01:00
parent 26d2334550
commit 3bba763ab9
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88

View File

@ -11,13 +11,14 @@
from .common import Extractor, Message
from .. import text, exception
from ..cache import cache, memcache
import json
import re
class TwitterExtractor(Extractor):
"""Base class for twitter extractors"""
category = "twitter"
directory_fmt = ("{category}", "{user}")
directory_fmt = ("{category}", "{user[name]}")
filename_fmt = "{tweet_id}_{num}.{extension}"
archive_fmt = "{tweet_id}_{retweet_id}_{num}"
root = "https://twitter.com"
@ -26,6 +27,7 @@ class TwitterExtractor(Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
self.user = match.group(1)
self._user_dict = None
self.logged_in = False
self.retweets = self.config("retweets", True)
self.content = self.config("content", False)
@ -37,23 +39,18 @@ class TwitterExtractor(Extractor):
def items(self):
self.login()
metadata = self.metadata()
yield Message.Version, 1
yield Message.Directory, self.metadata()
for tweet in self.tweets():
data = self._data_from_tweet(tweet)
if not self.retweets and data["retweet_id"]:
if not data or not self.retweets and data["retweet_id"]:
continue
images = text.extract_iter(
tweet, 'data-image-url="', '"')
for data["num"], url in enumerate(images, 1):
text.nameext_from_url(url, data)
urls = [url + size for size in self.sizes]
yield Message.Urllist, urls, data
data.update(metadata)
if self.videos and "-videoContainer" in tweet:
yield Message.Directory, data
if self.videos == "ytdl":
data["extension"] = None
url = "ytdl:{}/{}/status/{}".format(
@ -70,9 +67,19 @@ class TwitterExtractor(Extractor):
data["num"] = 1
yield Message.Url, url, data
elif "data-image-url=" in tweet:
yield Message.Directory, data
images = text.extract_iter(
tweet, 'data-image-url="', '"')
for data["num"], url in enumerate(images, 1):
text.nameext_from_url(url, data)
urls = [url + size for size in self.sizes]
yield Message.Urllist, urls, data
def metadata(self):
"""Return general metadata"""
return {"user": self.user}
return {}
def tweets(self):
"""Yield HTML content of all relevant tweets"""
@ -113,11 +120,33 @@ class TwitterExtractor(Extractor):
"tweet_id" : text.parse_int(extr('data-tweet-id="' , '"')),
"retweet_id": text.parse_int(extr('data-retweet-id="', '"')),
"retweeter" : extr('data-retweeter="' , '"'),
"user" : extr('data-screen-name="', '"'),
"username" : extr('data-name="' , '"'),
"user_id" : text.parse_int(extr('data-user-id="' , '"')),
"date" : text.parse_timestamp(extr('data-time="', '"')),
"author" : {
"name" : extr('data-screen-name="', '"'),
"nick" : text.unescape(extr('data-name="' , '"')),
"id" : text.parse_int(extr('data-user-id="' , '"')),
},
}
if not self._user_dict:
if data["retweet_id"]:
for user in json.loads(text.unescape(extr(
'data-reply-to-users-json="', '"'))):
if user["screen_name"] == data["retweeter"]:
break
else:
self.log.warning("Unable to extract user info")
return None
self._user_dict = {
"name": user["screen_name"],
"nick": text.unescape(user["name"]),
"id" : text.parse_int(user["id_str"]),
}
else:
self._user_dict = data["author"]
data["user"] = self._user_dict
data["date"] = text.parse_timestamp(extr('data-time="', '"'))
if self.content:
content = extr('<div class="js-tweet-text-container">', '\n</div>')
if '<img class="Emoji ' in content:
@ -125,6 +154,7 @@ class TwitterExtractor(Extractor):
content = text.unescape(text.remove_html(content, "", ""))
cl, _, cr = content.rpartition("pic.twitter.com/")
data["content"] = cl if cl and len(cr) < 16 else content
return data
def _video_from_tweet(self, tweet_id):
@ -204,7 +234,7 @@ class TwitterTimelineExtractor(TwitterExtractor):
("https://twitter.com/supernaturepics", {
"range": "1-40",
"url": "0106229d408f4111d9a52c8fd2ad687f64842aa4",
"keyword": "7210d679606240405e0cf62cbc67596e81a7a250",
"keyword": "37f4d35affd733d458d3b235b4a55f619a86f794",
}),
("https://mobile.twitter.com/supernaturepics?p=i"),
)
@ -262,13 +292,13 @@ class TwitterTweetExtractor(TwitterExtractor):
test = (
("https://twitter.com/supernaturepics/status/604341487988576256", {
"url": "0e801d2f98142dd87c3630ded9e4be4a4d63b580",
"keyword": "1b8afb93cc04a9f44d89173f8facc61c3a6caf91",
"keyword": "3fa3623e8d9a204597238e2f1f6433da19c63b4a",
"content": "ab05e1d8d21f8d43496df284d31e8b362cd3bcab",
}),
# 4 images
("https://twitter.com/perrypumas/status/894001459754180609", {
"url": "c8a262a9698cb733fb27870f5a8f75faf77d79f6",
"keyword": "43d98ab448193f0d4f30aa571a4b6bda9b6a5692",
"keyword": "49165725116ac52193a3861e8f5534e47a706b62",
}),
# video
("https://twitter.com/perrypumas/status/1065692031626829824", {
@ -278,7 +308,7 @@ class TwitterTweetExtractor(TwitterExtractor):
# content with emoji, newlines, hashtags (#338)
("https://twitter.com/yumi_san0112/status/1151144618936823808", {
"options": (("content", True),),
"keyword": "4d85faca51841b563aef613171e5efa9490219d8",
"keyword": "0b7a3d05607b480c1412dfd85f8606478313e7bf",
}),
# Reply to another tweet (#403)
("https://twitter.com/tyson_hesse/status/1103767554424598528", {
@ -295,9 +325,6 @@ class TwitterTweetExtractor(TwitterExtractor):
TwitterExtractor.__init__(self, match)
self.tweet_id = match.group(2)
def metadata(self):
return {"user": self.user, "tweet_id": self.tweet_id}
def tweets(self):
url = "{}/i/web/status/{}".format(self.root, self.tweet_id)
cookies = {"app_shell_visited": "1"}