implement youtube-dl downloader module

URLs starting with 'ytdl:' will now be handled by youtube-dl.
There is probably a lot to fix and improve, but the basic use case
works.

TODO:
- format selection and ytdl options in general
- better filename/path handling
- ytdl support for "unsupported URLs"
- ...
This commit is contained in:
Mike Fährmann 2018-10-05 17:58:15 +02:00
parent f4df6c2396
commit 188876d814
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88
3 changed files with 81 additions and 13 deletions

View File

@ -0,0 +1,59 @@
# -*- coding: utf-8 -*-
# Copyright 2018 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Downloader module for URLs requiring youtube-dl support"""
from .common import DownloaderBase
from youtube_dl import YoutubeDL
import logging
import os
class Downloader(DownloaderBase):
    """Downloader for 'ytdl:' URLs; hands the actual work to youtube-dl"""
    scheme = "ytdl"

    def __init__(self, session, output):
        DownloaderBase.__init__(self, session, output)
        # youtube-dl reports its own messages through the "ytdl" logger
        self.ytdl = YoutubeDL({
            "logger": logging.getLogger("ytdl"),
        })

    def download(self, url, pathfmt):
        """Download the video or playlist referenced by 'url'

        'url' is expected to carry the 'ytdl:' prefix, which is removed
        before the remainder is passed to youtube-dl.

        Returns True on success and False if youtube-dl could not extract
        information for the URL or the download itself failed.
        """
        try:
            # drop the leading "ytdl:" (scheme plus colon)
            info_dict = self.ytdl.extract_info(
                url[len(self.scheme) + 1:], download=False)
        except Exception:
            return False

        # youtube-dl marks playlist results with an "entries" key
        if "entries" in info_dict:
            return self._download_playlist(pathfmt, info_dict)
        return self._download_video(pathfmt, info_dict)

    def _download_video(self, pathfmt, info_dict):
        """Download the single video described by 'info_dict'

        Returns True if the file already exists or was downloaded,
        False if youtube-dl raised an exception while processing it.
        """
        pathfmt.set_extension(info_dict["ext"])

        if pathfmt.exists():
            # nothing to download; an empty temppath signals this to the
            # caller (which checks 'pathfmt.temppath' after a download)
            pathfmt.temppath = ""
            return True

        # NOTE(review): 'partdir' and 'out' are not set in this class;
        # presumably they are provided by DownloaderBase - confirm
        if self.partdir:
            pathfmt.temppath = os.path.join(
                self.partdir, pathfmt.filename)

        # 'outtmpl' is a %-style template, so literal percent signs in
        # the target path have to be doubled to survive formatting
        self.ytdl.params["outtmpl"] = pathfmt.temppath.replace("%", "%%")

        self.out.start(pathfmt.path)
        try:
            self.ytdl.process_info(info_dict)
        except Exception:
            return False
        return True

    def _download_playlist(self, pathfmt, info_dict):
        """Download every entry of the playlist described by 'info_dict'

        Each entry is written to a file name built from the playlist
        index and the entry's own extension.

        Returns True if all entries were processed, False as soon as
        youtube-dl raises an exception for one of them.
        """
        # let youtube-dl fill in index and extension for each entry
        pathfmt.set_extension("%(playlist_index)s.%(ext)s")
        # NOTE(review): unlike in _download_video(), percent signs in the
        # directory/filename part are not escaped here - confirm whether
        # 'realpath' can contain them
        self.ytdl.params["outtmpl"] = pathfmt.realpath

        try:
            for entry in info_dict["entries"]:
                self.ytdl.process_info(entry)
        except Exception:
            # same failure contract as _download_video()
            return False
        return True

View File

@ -9,7 +9,7 @@
"""Extract images from https://twitter.com/"""
from .common import Extractor, Message
from .. import text, extractor
from .. import text
class TwitterExtractor(Extractor):
@ -26,9 +26,6 @@ class TwitterExtractor(Extractor):
self.retweets = self.config("retweets", True)
self.videos = self.config("videos", False)
if self.videos:
self._blacklist = extractor.blacklist(("twitter",))
def items(self):
yield Message.Version, 1
yield Message.Directory, self.metadata()
@ -45,10 +42,10 @@ class TwitterExtractor(Extractor):
yield Message.Url, url + ":orig", data
if self.videos and "-videoContainer" in tweet:
url = "{}/{}/status/{}".format(
data["num"] = 1
url = "ytdl:{}/{}/status/{}".format(
self.root, data["user"], data["tweet_id"])
with self._blacklist:
yield Message.Queue, url, data
yield Message.Url, url, data
def metadata(self):
"""Return general metadata"""

View File

@ -179,17 +179,17 @@ class DownloadJob(Job):
time.sleep(self.sleep)
# download from URL
if not self.get_downloader(url).download(url, self.pathfmt):
if not self.download(url):
# use fallback URLs if available
for num, url in enumerate(fallback or (), 1):
self.log.info("Trying fallback URL #%d", num)
if self.get_downloader(url).download(url, self.pathfmt):
if self.download(url):
break
else:
# download failed
self.log.error(
"Failed to download %s", self.pathfmt.filename)
"Failed to download %s", self.pathfmt.filename or url)
return
if not self.pathfmt.temppath:
@ -230,17 +230,29 @@ class DownloadJob(Job):
for pp in self.postprocessors:
pp.finalize()
def get_downloader(self, url):
"""Return, and possibly construct, a downloader suitable for 'url'"""
def download(self, url):
    """Download 'url' with the downloader responsible for its scheme

    Returns the downloader's result, or False if no downloader is
    available for the URL's scheme.
    """
    # everything before the first ":" selects the downloader
    prefix, _, _ = url.partition(":")
    handler = self.get_downloader(prefix)
    if not handler:
        return False
    return handler.download(url, self.pathfmt)
def get_downloader(self, scheme):
    """Return a downloader suitable for 'scheme'

    Downloader instances are created on first use and cached in
    'self.downloaders'. If no downloader class exists for 'scheme',
    an error is logged and None is returned; the None result is cached
    as well, so the error is logged only once per scheme.
    """
    # 'https' URLs share the 'http' downloader
    if scheme == "https":
        scheme = "http"
    try:
        return self.downloaders[scheme]
    except KeyError:
        pass

    klass = downloader.find(scheme)
    # only instantiate after confirming that a class was found;
    # calling a missing class would raise instead of reporting the error
    if klass:
        instance = klass(self.extractor.session, self.out)
    else:
        instance = None
        self.log.error("'%s:' URLs are not supported", scheme)
    self.downloaders[scheme] = instance
    return instance