2016-10-02 13:39:18 +02:00
# coding: utf-8
2014-03-12 04:20:47 +07:00
from __future__ import unicode_literals
import re
from . common import InfoExtractor
2018-01-13 23:28:08 +07:00
from . . compat import (
compat_str ,
compat_urlparse ,
)
2014-12-13 12:24:42 +01:00
from . . utils import (
2016-05-26 17:30:38 +02:00
determine_ext ,
2016-06-09 13:49:35 +08:00
ExtractorError ,
2016-05-26 19:08:12 +02:00
js_to_json ,
2016-05-26 16:59:45 +02:00
strip_jsonp ,
2018-01-13 23:28:08 +07:00
try_get ,
2014-03-30 07:25:42 +02:00
unified_strdate ,
2016-06-09 13:41:12 +08:00
update_url_query ,
2016-06-09 13:04:30 +08:00
urlhandle_detect_ext ,
2014-03-12 04:20:47 +07:00
)
2018-01-13 23:28:08 +07:00
class WDRIE(InfoExtractor):
    """Extractor for the JSONP media descriptors served by deviceids-medp.wdr.de.

    The descriptor is downloaded as JSON (after stripping the JSONP wrapper)
    and turned into an info dict containing formats, German subtitles (when a
    caption URL is present) and the tracker metadata (id, title, air date).
    """

    _VALID_URL = r'https?://deviceids-medp\.wdr\.de/ondemand/\d+/(?P<id>\d+)\.js'
    _TEST = {
        'url': 'http://deviceids-medp.wdr.de/ondemand/155/1557833.js',
        'info_dict': {
            'id': 'mdb-1140188',
            'display_id': 'dfb-team-geht-gut-gelaunt-ins-spiel-gegen-polen-100',
            'ext': 'mp4',
            'title': 'DFB-Team geht gut gelaunt ins Spiel gegen Polen',
            'description': 'Vor dem zweiten Gruppenspiel gegen Polen herrscht gute Stimmung im deutschen Team. Insbesondere Bastian Schweinsteiger strotzt vor Optimismus nach seinem Tor gegen die Ukraine.',
            'upload_date': '20160615',
        },
        'skip': 'Geo-restricted to Germany',
    }

    def _real_extract(self, url):
        """Download the descriptor at *url* and build the info dict.

        Raises KeyError if the descriptor lacks 'trackerData',
        'mediaResource' or the tracker's 'trackerClipTitle'.
        """
        video_id = self._match_id(url)

        metadata = self._download_json(
            url, video_id, transform_source=strip_jsonp)

        is_live = metadata.get('mediaType') == 'live'

        tracker_data = metadata['trackerData']
        media_resource = metadata['mediaResource']

        formats = []

        # check if the metadata contains a direct URL to a file
        # NOTE: the loop variable must not reuse the name ``media_resource``;
        # the top-level dict is needed again below for the caption lookup.
        for kind, media in media_resource.items():
            if kind not in ('dflt', 'alt'):
                continue
            if not isinstance(media, dict):
                continue

            for tag_name, medium_url in media.items():
                if tag_name not in ('videoURL', 'audioURL'):
                    continue

                ext = determine_ext(medium_url)
                if ext == 'm3u8':
                    formats.extend(self._extract_m3u8_formats(
                        medium_url, video_id, 'mp4', 'm3u8_native',
                        m3u8_id='hls'))
                elif ext == 'f4m':
                    manifest_url = update_url_query(
                        medium_url, {'hdcore': '3.2.0', 'plugin': 'aasp-3.2.0.77.18'})
                    formats.extend(self._extract_f4m_formats(
                        manifest_url, video_id, f4m_id='hds', fatal=False))
                elif ext == 'smil':
                    formats.extend(self._extract_smil_formats(
                        medium_url, 'stream', fatal=False))
                else:
                    a_format = {
                        'url': medium_url
                    }
                    if ext == 'unknown_video':
                        # The URL gives no usable extension, so request the
                        # resource and derive the extension from the response.
                        urlh = self._request_webpage(
                            medium_url, video_id, note='Determining extension')
                        ext = urlhandle_detect_ext(urlh)
                        a_format['ext'] = ext
                    formats.append(a_format)

        self._sort_formats(formats)

        subtitles = {}
        # 'captionURL' is read from the top-level mediaResource dict.
        caption_url = media_resource.get('captionURL')
        if caption_url:
            subtitles['de'] = [{
                'url': caption_url,
                'ext': 'ttml',
            }]

        title = tracker_data['trackerClipTitle']

        return {
            # Fall back to the numeric id from the URL when no clip id is given.
            'id': tracker_data.get('trackerClipId', video_id),
            'title': self._live_title(title) if is_live else title,
            'alt_title': tracker_data.get('trackerClipSubcategory'),
            'formats': formats,
            'subtitles': subtitles,
            'upload_date': unified_strdate(tracker_data.get('trackerClipAirTime')),
            'is_live': is_live,
        }
2018-01-13 23:28:08 +07:00
class WDRPageIE(InfoExtractor):
    """Extractor for wdr.de / sportschau.de / wdrmaus.de HTML pages.

    The page is scanned for ``data-extension`` attributes; every media
    descriptor URL found there is delegated to WDRIE.  When none is found,
    links to further matching pages are collected instead.  The result is
    always returned as a playlist.
    """

    _CURRENT_MAUS_URL = r'https?://(?:www\.)wdrmaus\.de/(?:[^/]+/){1,2}[^/?#]+\.php5'
    _PAGE_REGEX = r'/(?:mediathek/)?(?:[^/]+/)*(?P<display_id>[^/]+)\.html'
    _VALID_URL = r'https?://(?:www\d?\.)?(?:wdr\d?|sportschau)\.de' + _PAGE_REGEX + '|' + _CURRENT_MAUS_URL

    _TESTS = [
        {
            'url': 'http://www1.wdr.de/mediathek/video/sendungen/doku-am-freitag/video-geheimnis-aachener-dom-100.html',
            # HDS download, MD5 is unstable
            'info_dict': {
                'id': 'mdb-1058683',
                'ext': 'flv',
                'display_id': 'doku-am-freitag/video-geheimnis-aachener-dom-100',
                'title': 'Geheimnis Aachener Dom',
                'alt_title': 'Doku am Freitag',
                'upload_date': '20160304',
                'description': 'md5:87be8ff14d8dfd7a7ee46f0299b52318',
                'is_live': False,
                'subtitles': {'de': [{
                    'url': 'http://ondemand-ww.wdr.de/medp/fsk0/105/1058683/1058683_12220974.xml',
                    'ext': 'ttml',
                }]},
            },
            'skip': 'HTTP Error 404: Not Found',
        },
        {
            'url': 'http://www1.wdr.de/mediathek/audio/wdr3/wdr3-gespraech-am-samstag/audio-schriftstellerin-juli-zeh-100.html',
            'md5': 'f4c1f96d01cf285240f53ea4309663d8',
            'info_dict': {
                'id': 'mdb-1072000',
                'ext': 'mp3',
                'display_id': 'wdr3-gespraech-am-samstag/audio-schriftstellerin-juli-zeh-100',
                'title': 'Schriftstellerin Juli Zeh',
                'alt_title': 'WDR 3 Gespräch am Samstag',
                'upload_date': '20160312',
                'description': 'md5:e127d320bc2b1f149be697ce044a3dd7',
                'is_live': False,
                'subtitles': {}
            },
            'skip': 'HTTP Error 404: Not Found',
        },
        {
            'url': 'http://www1.wdr.de/mediathek/video/live/index.html',
            'info_dict': {
                'id': 'mdb-1406149',
                'ext': 'mp4',
                'title': r're:^WDR Fernsehen im Livestream \(nur in Deutschland erreichbar\) [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
                'alt_title': 'WDR Fernsehen Live',
                'upload_date': '20150101',
                'is_live': True,
            },
            'params': {
                'skip_download': True,  # m3u8 download
            },
        },
        {
            'url': 'http://www1.wdr.de/mediathek/video/sendungen/aktuelle-stunde/aktuelle-stunde-120.html',
            'playlist_mincount': 7,
            'info_dict': {
                'id': 'aktuelle-stunde-120',
            },
        },
        {
            'url': 'http://www.wdrmaus.de/aktuelle-sendung/index.php5',
            'info_dict': {
                'id': 'mdb-1552552',
                'ext': 'mp4',
                'upload_date': 're:^[0-9]{8}$',
                'title': 're:^Die Sendung mit der Maus vom [0-9.]{10}$',
            },
            'skip': 'The id changes from week to week because of the new episode'
        },
        {
            'url': 'http://www.wdrmaus.de/filme/sachgeschichten/achterbahn.php5',
            'md5': '803138901f6368ee497b4d195bb164f2',
            'info_dict': {
                'id': 'mdb-186083',
                'ext': 'mp4',
                'upload_date': '20130919',
                'title': 'Sachgeschichte - Achterbahn ',
            },
        },
        {
            'url': 'http://www1.wdr.de/radio/player/radioplayer116~_layout-popupVersion.html',
            # Live stream, MD5 unstable
            'info_dict': {
                'id': 'mdb-869971',
                'ext': 'mp4',
                'title': r're:^COSMO Livestream [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
                'upload_date': '20160101',
            },
            'params': {
                'skip_download': True,  # m3u8 download
            }
        },
        {
            'url': 'http://www.sportschau.de/handballem2018/handball-nationalmannschaft-em-stolperstein-vorrunde-100.html',
            'info_dict': {
                'id': 'mdb-1556012',
                'ext': 'mp4',
                'title': 'DHB-Vizepräsident Bob Hanning - "Die Weltspitze ist extrem breit"',
                'upload_date': '20180111',
            },
            'params': {
                'skip_download': True,
            },
        },
        {
            'url': 'http://www.sportschau.de/handballem2018/audio-vorschau---die-handball-em-startet-mit-grossem-favoritenfeld-100.html',
            'only_matching': True,
        }
    ]

    def _real_extract(self, url):
        """Return a playlist of the media items (or sub-pages) found at *url*."""
        url_match = re.match(self._VALID_URL, url)
        # The 'display_id' group belongs to the _PAGE_REGEX alternative only;
        # for URLs matched through _CURRENT_MAUS_URL it is None.
        display_id = url_match.group('display_id')
        webpage = self._download_webpage(url, display_id)

        entries = []

        # Article with several videos

        # for wdr.de the data-extension is in a tag with the class "mediaLink"
        # for wdr.de radio players, in a tag with the class "wdrrPlayerPlayBtn"
        # for wdrmaus, in a tag with the class "videoButton" (previously a link
        # to the page in a multiline "videoLink"-tag)
        for media_match in re.finditer(
                r'''(?sx)class=
                        (?:
                            (["\'])(?:mediaLink|wdrrPlayerPlayBtn|videoButton)\b.*?\1[^>]+|
                            (["\'])videoLink\b.*?\2[\s]*>\n[^\n]*
                        )data-extension=(["\'])(?P<data>(?:(?!\3).)+)\3
                    ''', webpage):
            # Non-fatal: an attribute that does not parse is skipped.
            media_link_obj = self._parse_json(
                media_match.group('data'), display_id,
                transform_source=js_to_json, fatal=False)
            if not media_link_obj:
                continue
            jsonp_url = try_get(
                media_link_obj, lambda x: x['mediaObj']['url'], compat_str)
            if jsonp_url:
                entries.append(self.url_result(jsonp_url, ie=WDRIE.ie_key()))

        # Playlist (e.g. https://www1.wdr.de/mediathek/video/sendungen/aktuelle-stunde/aktuelle-stunde-120.html)
        if not entries:
            # Only hrefs that themselves look like a page handled by this
            # extractor (per _PAGE_REGEX) are followed.
            entries = [
                self.url_result(
                    compat_urlparse.urljoin(url, link_match.group('href')),
                    ie=WDRPageIE.ie_key())
                for link_match in re.finditer(
                    r'<a[^>]+\bhref=(["\'])(?P<href>(?:(?!\1).)+)\1[^>]+\bdata-extension=',
                    webpage) if re.match(self._PAGE_REGEX, link_match.group('href'))
            ]

        return self.playlist_result(entries, playlist_id=display_id)
2014-03-30 07:25:42 +02:00
2017-10-25 14:59:57 +02:00
2018-01-13 23:28:08 +07:00
class WDRElefantIE(InfoExtractor):
    """Extractor for wdrmaus.de "Elefantenseite" entries addressed by URL fragment.

    The fragment is looked up in the site's table of contents, the XML
    metadata it points to is downloaded, and the media descriptor URL found
    there is delegated to WDRIE.
    """

    _VALID_URL = r'https?://(?:www\.)wdrmaus\.de/elefantenseite/#(?P<id>.+)'
    _TEST = {
        'url': 'http://www.wdrmaus.de/elefantenseite/#folge_ostern_2015',
        'info_dict': {
            'title': 'Folge Oster-Spezial 2015',
            'id': 'mdb-1088195',
            'ext': 'mp4',
            'age_limit': None,
            'upload_date': '20150406'
        },
        'params': {
            'skip_download': True,
        },
    }

    def _real_extract(self, url):
        """Resolve the fragment of *url* to a WDRIE url_result.

        Raises ExtractorError (expected) when the fragment is not listed in
        the table of contents or the entry carries no media URL.
        """
        display_id = self._match_id(url)

        # Table of Contents seems to always be at this address, so fetch it directly.
        # The website fetches configurationJS.php5, which links to tableOfContentsJS.php5.
        table_of_contents = self._download_json(
            'https://www.wdrmaus.de/elefantenseite/data/tableOfContentsJS.php5',
            display_id)
        if display_id not in table_of_contents:
            raise ExtractorError(
                'No entry in site\'s table of contents for this URL. '
                'Is the fragment part of the URL (after the #) correct?',
                expected=True)
        xml_metadata_path = table_of_contents[display_id]['xmlPath']
        xml_metadata = self._download_xml(
            'https://www.wdrmaus.de/elefantenseite/' + xml_metadata_path,
            display_id)
        zmdb_url_element = xml_metadata.find('./movie/zmdb_url')
        # An element without text gives no URL to hand on, so it is treated
        # the same as a missing element.
        if zmdb_url_element is None or not zmdb_url_element.text:
            raise ExtractorError(
                '%s is not a video' % display_id, expected=True)
        return self.url_result(zmdb_url_element.text, ie=WDRIE.ie_key())
2017-10-25 14:59:57 +02:00
2014-05-12 22:17:19 +02:00
class WDRMobileIE(InfoExtractor):
    """Extractor for direct media URLs on mobile-ondemand.wdr.de.

    All metadata (id, title, age limit) is taken from the URL itself; no
    network request is made during extraction.
    """

    _VALID_URL = r'''(?x)
        https?://mobile-ondemand\.wdr\.de/
        .*?/fsk(?P<age_limit>[0-9]+)
        /[0-9]+/[0-9]+/
        (?P<id>[0-9]+)_(?P<title>[0-9]+)'''
    IE_NAME = 'wdr:mobile'
    _TEST = {
        'url': 'http://mobile-ondemand.wdr.de/CMS2010/mdb/ondemand/weltweit/fsk0/42/421735/421735_4283021.mp4',
        'info_dict': {
            'title': '4283021',
            'id': '421735',
            'ext': 'mp4',
            'age_limit': 0,
        },
        'skip': 'Problems with loading data.'
    }

    def _real_extract(self, url):
        """Build the info dict from the named groups of _VALID_URL."""
        mobj = re.match(self._VALID_URL, url)
        return {
            'id': mobj.group('id'),
            'title': mobj.group('title'),
            # The number following "fsk" in the path.
            'age_limit': int(mobj.group('age_limit')),
            'url': url,
            'http_headers': {
                'User-Agent': 'mobile',
            },
        }