generic extractor (#735)
* Generic extractor, see issue #683 * Fix failed test_names test, no subcategory needed * Prefix directory_fmt with "generic" * Relax regex (would break some urls) * Flake8 compliance * pattern: don't require a scheme This fixes a bug when we force the generic extractor on urls without a scheme (that are allowed by all other extractors). * Fix using g: and r: on urls without http(s) scheme Almost all extractors accept urls without an initial http(s) scheme. Many extractors also allow for generic subdomains in their "pattern" variable; some of them implement this with the regex character class "[^.]+" (everything but a dot). This leads to a problem when the extractor is given a url starting with g: or r: (to force using the generic or recursive extractor) and without the http(s) scheme: e.g. with "r:foobar.tumblr.com" the "r:" is wrongly considered part of the subdomain. This commit fixes the bug, replacing the too generic "[^.]+" with the more specific "[\w-]+" (letters, digits and "-", the only characters allowed in domain names), which is already used by some extractors. * Relax imageurl_pattern_ext: allow relative urls * First round of small suggested changes * Support image urls starting with "//" * self.baseurl: remove trailing slash * Relax regexp (didn't catch some image urls) * Some fixes and cleanup * Fix domain pattern; option to enable extractor Fixed the domain section for "pattern", to pass "test_add" and "test_add_module" tests. Added the "enabled" configuration option (default False) to enable the generic extractor. Using "g(eneric):URL" forces using the extractor.
This commit is contained in:
parent
4376b39a2b
commit
96fcff182c
@ -20,7 +20,7 @@ class _2chanThreadExtractor(Extractor):
|
|||||||
filename_fmt = "{tim}.{extension}"
|
filename_fmt = "{tim}.{extension}"
|
||||||
archive_fmt = "{board}_{thread}_{tim}"
|
archive_fmt = "{board}_{thread}_{tim}"
|
||||||
url_fmt = "https://{server}.2chan.net/{board}/src/{filename}"
|
url_fmt = "https://{server}.2chan.net/{board}/src/{filename}"
|
||||||
pattern = r"(?:https?://)?([^.]+)\.2chan\.net/([^/]+)/res/(\d+)"
|
pattern = r"(?:https?://)?([\w-]+)\.2chan\.net/([^/]+)/res/(\d+)"
|
||||||
test = ("http://dec.2chan.net/70/res/4752.htm", {
|
test = ("http://dec.2chan.net/70/res/4752.htm", {
|
||||||
"url": "f49aa31340e9a3429226af24e19e01f5b819ca1f",
|
"url": "f49aa31340e9a3429226af24e19e01f5b819ca1f",
|
||||||
"keyword": "44599c21b248e79692b2eb2da12699bd0ed5640a",
|
"keyword": "44599c21b248e79692b2eb2da12699bd0ed5640a",
|
||||||
|
@ -152,6 +152,7 @@ modules = [
|
|||||||
"oauth",
|
"oauth",
|
||||||
"test",
|
"test",
|
||||||
"ytdl",
|
"ytdl",
|
||||||
|
"generic",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
@ -15,7 +15,7 @@ import re
|
|||||||
|
|
||||||
BASE_PATTERN = (
|
BASE_PATTERN = (
|
||||||
r"(?:blogger:(?:https?://)?([^/]+)|"
|
r"(?:blogger:(?:https?://)?([^/]+)|"
|
||||||
r"(?:https?://)?([^.]+\.blogspot\.com))")
|
r"(?:https?://)?([\w-]+\.blogspot\.com))")
|
||||||
|
|
||||||
|
|
||||||
class BloggerExtractor(Extractor):
|
class BloggerExtractor(Extractor):
|
||||||
|
@ -56,7 +56,7 @@ class FlickrImageExtractor(FlickrExtractor):
|
|||||||
subcategory = "image"
|
subcategory = "image"
|
||||||
pattern = (r"(?:https?://)?(?:"
|
pattern = (r"(?:https?://)?(?:"
|
||||||
r"(?:(?:www\.|m\.)?flickr\.com/photos/[^/]+/"
|
r"(?:(?:www\.|m\.)?flickr\.com/photos/[^/]+/"
|
||||||
r"|[^.]+\.static\.?flickr\.com/(?:\d+/)+)(\d+)"
|
r"|[\w-]+\.static\.?flickr\.com/(?:\d+/)+)(\d+)"
|
||||||
r"|flic\.kr/p/([A-Za-z1-9]+))")
|
r"|flic\.kr/p/([A-Za-z1-9]+))")
|
||||||
test = (
|
test = (
|
||||||
("https://www.flickr.com/photos/departingyyz/16089302239", {
|
("https://www.flickr.com/photos/departingyyz/16089302239", {
|
||||||
|
208
gallery_dl/extractor/generic.py
Normal file
208
gallery_dl/extractor/generic.py
Normal file
@ -0,0 +1,208 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
"""Extractor for images in a generic web page."""
|
||||||
|
|
||||||
|
from .common import Extractor, Message
|
||||||
|
from .. import config, text
|
||||||
|
import re
|
||||||
|
import os.path
|
||||||
|
|
||||||
|
|
||||||
|
class GenericExtractor(Extractor):
    """Extractor for images in a generic web page.

    The page is fetched once and scanned with regular expressions for
    likely image/video urls; every url found is yielded for download
    together with some metadata taken from the page's <title> and <meta>
    elements.
    """

    category = "generic"
    directory_fmt = ("{category}", "{pageurl}")
    archive_fmt = "{imageurl}"

    # By default, the generic extractor is disabled
    # and the "g(eneric):" prefix in url is required.
    # If the extractor is enabled, make the prefix optional
    pattern = r"(?ix)(?P<generic>g(?:eneric)?:)"
    if config.get(("extractor", "generic"), "enabled"):
        pattern += r"?"

    # The generic extractor pattern should match (almost) any valid url
    # Based on: https://tools.ietf.org/html/rfc3986#appendix-B
    pattern += r"""
        (?P<scheme>https?://)?          # optional http(s) scheme
        (?P<domain>[-\w\.]+)            # required domain
        (?P<path>/[^?&#]*)?             # optional path
        (?:\?(?P<query>[^/?#]*))?       # optional query
        (?:\#(?P<fragment>.*))?$        # optional fragment
        """

    def __init__(self, match):
        """Set up the page url, its scheme and its root from 'match'.

        'match' is a match object of the 'pattern' class variable.
        Sets the following attributes:

        self.url     page url without the "g(eneric):" prefix and always
                     with a scheme
        self.scheme  scheme of the page url, including the trailing "://"
        self.root    scheme + domain, used to resolve "/..." image urls
        """
        Extractor.__init__(self, match)

        # Strip the "g(eneric):" prefix
        # and inform about "forced" or "fallback" mode
        if match.group('generic'):
            self.log.info("Forcing use of generic information extractor.")
            self.url = match.group(0).partition(":")[2]
        else:
            self.log.info("Falling back on generic information extractor.")
            self.url = match.group(0)

        # Make sure we have a scheme, or use https
        if match.group('scheme'):
            self.scheme = match.group('scheme')
        else:
            self.scheme = 'https://'
            self.url = self.scheme + self.url

        # Used to resolve relative image urls
        self.root = self.scheme + match.group('domain')

    def items(self):
        """Get page, extract metadata & images, yield them in suitable messages.

        Adapted from common.GalleryExtractor.items()

        Yields a Message.Version and a Message.Directory message first,
        followed by one Message.Url message per image. The same metadata
        dict is updated in place and re-used for every image.
        """
        page = self.request(self.url).text
        data = self.metadata(page)
        imgs = self.images(page)

        # images() returns a list here; the TypeError guard is kept from
        # the adapted code so that an override returning an object without
        # a length does not break
        try:
            data["count"] = len(imgs)
        except TypeError:
            pass

        yield Message.Version, 1
        yield Message.Directory, data

        for num, (url, imgdata) in enumerate(imgs, 1):
            data["num"] = num
            if imgdata:
                data.update(imgdata)
            # Derive filename and extension from the url,
            # unless the image metadata already provides an extension
            if not imgdata or "extension" not in imgdata:
                text.nameext_from_url(url, data)
            yield Message.Url, url, data

    def metadata(self, page):
        """Extract generic webpage metadata, return them in a dict.

        The result always starts with 'pageurl'; every other key is only
        present if the corresponding element was found in 'page' with a
        non-empty value. All values are HTML-unescaped.
        """
        # (metadata key, text preceding the value, text following the value)
        fields = (
            ('title', '<title>', "</title>"),
            ('description', '<meta name="description" content="', '"'),
            ('keywords', '<meta name="keywords" content="', '"'),
            ('language', '<meta name="language" content="', '"'),
            ('name', '<meta itemprop="name" content="', '"'),
            ('copyright', '<meta name="copyright" content="', '"'),
            ('og_site', '<meta property="og:site" content="', '"'),
            ('og_site_name',
             '<meta property="og:site_name" content="', '"'),
            ('og_title', '<meta property="og:title" content="', '"'),
            ('og_description',
             '<meta property="og:description" content="', '"'),
        )

        data = {'pageurl': self.url}
        for key, begin, end in fields:
            data[key] = text.extract(page, begin, end)[0] or ""

        # Drop everything that was not found and unescape the rest
        return {
            key: text.unescape(value)
            for key, value in data.items()
            if value != ""
        }

    def images(self, page):
        """Extract image urls, return a list of (image url, metadata) tuples.

        The extractor aims at finding as many _likely_ image urls as possible,
        using two strategies (see below); since these often overlap, any
        duplicate urls will be removed at the end of the process. The
        remaining urls keep the order in which they were first found, so
        that the numbering done by items() is the same on every run.

        Note: since we are using re.findall() (see below), it's essential that
        the following patterns contain 0 or at most 1 capturing group, so that
        re.findall() return a list of urls (instead of a list of tuples of
        matching groups). All other groups used in the pattern should be
        non-capturing (?:...).

        As a side effect, self.baseurl is set to the url (without trailing
        slash) that relative image urls are resolved against.

        1: Look in src/srcset attributes of img/video/source elements

        See:
        https://www.w3schools.com/tags/att_src.asp
        https://www.w3schools.com/tags/att_source_srcset.asp

        We allow both absolute and relative urls here.

        Note that srcset attributes often contain multiple space separated
        image urls; this pattern matches only the first url; remaining urls
        will be matched by the "imageurl_pattern_ext" pattern below.
        """
        imageurl_pattern_src = r"""(?ix)
            <(?:img|video|source)\s.*?      # <img>, <video> or <source>
            src(?:set)?=["']?               # src or srcset attributes
            (?P<URL>[^"'\s>]+)              # url
            """

        # 2: Look anywhere for urls containing common image/video extensions
        #
        # The list of allowed extensions is borrowed from the directlink.py
        # extractor; other could be added, see
        # https://en.wikipedia.org/wiki/List_of_file_formats
        #
        # Compared to the "pattern" class variable, here we must exclude also
        # other special characters (space, ", ', >), since we are looking for
        # urls in html tags.
        imageurl_pattern_ext = r"""(?ix)
            (?:[^?&#"'>\s]+)                # anything until dot+extension
            \.(?:jpe?g|jpe|png|gif
               |web[mp]|mp4|mkv|og[gmv]|opus)   # dot + image/video extensions
            (?:[^"'>\s]*)?                  # optional query and fragment
            """

        imageurls_src = re.findall(imageurl_pattern_src, page)
        imageurls_ext = re.findall(imageurl_pattern_ext, page)
        imageurls = imageurls_src + imageurls_ext

        # Resolve relative urls
        #
        # Image urls caught so far may be relative, so we must resolve them
        # by prepending a suitable base url.
        #
        # If the page contains a <base> element, use it as base url
        basematch = re.search(
            r"(?i)(?:<base\s.*?href=[\"']?)(?P<url>[^\"' >]+)", page)
        if basematch:
            self.baseurl = basematch.group('url').rstrip('/')
        # Otherwise, extract the base url from self.url
        else:
            self.baseurl = self._page_baseurl()

        # Build the list of absolute image urls,
        # skipping urls that have already been seen
        absimageurls = []
        seen = set()
        for u in imageurls:
            # Absolute urls are taken as-is; the full "http(s)://" prefix
            # is required, so that relative urls that merely start with
            # "http" (e.g. "httpd-logo.png") are not mistaken for them
            if u.lower().startswith(('http://', 'https://')):
                absurl = u
            # // relative urls are prefixed with current scheme
            elif u.startswith('//'):
                absurl = self.scheme + u.lstrip('/')
            # / relative urls are prefixed with current scheme+domain
            elif u.startswith('/'):
                absurl = self.root + u
            # other relative urls are prefixed with baseurl
            else:
                absurl = self.baseurl + '/' + u

            if absurl not in seen:
                seen.add(absurl)
                absimageurls.append(absurl)

        # Create the image metadata dict and add imageurl to it
        # (image filename and extension are added by items())
        return [(u, {'imageurl': u}) for u in absimageurls]

    def _page_baseurl(self):
        """Return the "directory" of self.url, without a trailing slash.

        Query and fragment are dropped first and only the path component
        is shortened, so that a url without any path (e.g.
        "https://example.com") yields "https://example.com" instead of
        "https:", and slashes after "?" or "#" are ignored.
        """
        pageurl = self.url.partition("#")[0].partition("?")[0]
        scheme, sep, rest = pageurl.partition("://")
        domain, slash, path = rest.partition("/")
        directory = os.path.dirname(slash + path).rstrip("/")
        return scheme + sep + domain + directory
|
@ -169,7 +169,7 @@ class ImgbbAlbumExtractor(ImgbbExtractor):
|
|||||||
class ImgbbUserExtractor(ImgbbExtractor):
|
class ImgbbUserExtractor(ImgbbExtractor):
|
||||||
"""Extractor for user profiles in imgbb.com"""
|
"""Extractor for user profiles in imgbb.com"""
|
||||||
subcategory = "user"
|
subcategory = "user"
|
||||||
pattern = r"(?:https?://)?([^.]+)\.imgbb\.com/?(?:\?([^#]+))?$"
|
pattern = r"(?:https?://)?([\w-]+)\.imgbb\.com/?(?:\?([^#]+))?$"
|
||||||
test = ("https://folkie.imgbb.com", {
|
test = ("https://folkie.imgbb.com", {
|
||||||
"range": "1-80",
|
"range": "1-80",
|
||||||
"pattern": r"https?://i\.ibb\.co/\w+/[^/?#]+",
|
"pattern": r"https?://i\.ibb\.co/\w+/[^/?#]+",
|
||||||
|
@ -19,7 +19,7 @@ class KeenspotComicExtractor(Extractor):
|
|||||||
directory_fmt = ("{category}", "{comic}")
|
directory_fmt = ("{category}", "{comic}")
|
||||||
filename_fmt = "{filename}.{extension}"
|
filename_fmt = "{filename}.{extension}"
|
||||||
archive_fmt = "{comic}_{filename}"
|
archive_fmt = "{comic}_{filename}"
|
||||||
pattern = r"(?:https?://)?(?!www\.|forums\.)([^.]+)\.keenspot\.com(/.+)?"
|
pattern = r"(?:https?://)?(?!www\.|forums\.)([\w-]+)\.keenspot\.com(/.+)?"
|
||||||
test = (
|
test = (
|
||||||
("http://marksmen.keenspot.com/", { # link
|
("http://marksmen.keenspot.com/", { # link
|
||||||
"range": "1-3",
|
"range": "1-3",
|
||||||
|
@ -20,8 +20,8 @@ class MyportfolioGalleryExtractor(Extractor):
|
|||||||
filename_fmt = "{num:>02}.{extension}"
|
filename_fmt = "{num:>02}.{extension}"
|
||||||
archive_fmt = "{user}_{filename}"
|
archive_fmt = "{user}_{filename}"
|
||||||
pattern = (r"(?:myportfolio:(?:https?://)?([^/]+)|"
|
pattern = (r"(?:myportfolio:(?:https?://)?([^/]+)|"
|
||||||
r"(?:https?://)?([^.]+\.myportfolio\.com))"
|
r"(?:https?://)?([\w-]+\.myportfolio\.com))"
|
||||||
r"(/[^/?#]+)?")
|
r"(/[^/?&#]+)?")
|
||||||
test = (
|
test = (
|
||||||
("https://andrewling.myportfolio.com/volvo-xc-90-hybrid", {
|
("https://andrewling.myportfolio.com/volvo-xc-90-hybrid", {
|
||||||
"url": "acea0690c76db0e5cf267648cefd86e921bc3499",
|
"url": "acea0690c76db0e5cf267648cefd86e921bc3499",
|
||||||
|
@ -420,7 +420,7 @@ class NewgroundsFavoriteExtractor(NewgroundsExtractor):
|
|||||||
"""Extractor for posts favorited by a newgrounds user"""
|
"""Extractor for posts favorited by a newgrounds user"""
|
||||||
subcategory = "favorite"
|
subcategory = "favorite"
|
||||||
directory_fmt = ("{category}", "{user}", "Favorites")
|
directory_fmt = ("{category}", "{user}", "Favorites")
|
||||||
pattern = (r"(?:https?://)?([^.]+)\.newgrounds\.com"
|
pattern = (r"(?:https?://)?([\w-]+)\.newgrounds\.com"
|
||||||
r"/favorites(?!/following)(?:/(art|audio|movies))?/?")
|
r"/favorites(?!/following)(?:/(art|audio|movies))?/?")
|
||||||
test = (
|
test = (
|
||||||
("https://tomfulp.newgrounds.com/favorites/art", {
|
("https://tomfulp.newgrounds.com/favorites/art", {
|
||||||
@ -475,7 +475,7 @@ class NewgroundsFavoriteExtractor(NewgroundsExtractor):
|
|||||||
class NewgroundsFollowingExtractor(NewgroundsFavoriteExtractor):
|
class NewgroundsFollowingExtractor(NewgroundsFavoriteExtractor):
|
||||||
"""Extractor for a newgrounds user's favorited users"""
|
"""Extractor for a newgrounds user's favorited users"""
|
||||||
subcategory = "following"
|
subcategory = "following"
|
||||||
pattern = r"(?:https?://)?([^.]+)\.newgrounds\.com/favorites/(following)"
|
pattern = r"(?:https?://)?([\w-]+)\.newgrounds\.com/favorites/(following)"
|
||||||
test = ("https://tomfulp.newgrounds.com/favorites/following", {
|
test = ("https://tomfulp.newgrounds.com/favorites/following", {
|
||||||
"pattern": NewgroundsUserExtractor.pattern,
|
"pattern": NewgroundsUserExtractor.pattern,
|
||||||
"range": "76-125",
|
"range": "76-125",
|
||||||
|
@ -21,8 +21,8 @@ class PhotobucketAlbumExtractor(Extractor):
|
|||||||
directory_fmt = ("{category}", "{username}", "{location}")
|
directory_fmt = ("{category}", "{username}", "{location}")
|
||||||
filename_fmt = "{offset:>03}{pictureId:?_//}_{titleOrFilename}.{extension}"
|
filename_fmt = "{offset:>03}{pictureId:?_//}_{titleOrFilename}.{extension}"
|
||||||
archive_fmt = "{id}"
|
archive_fmt = "{id}"
|
||||||
pattern = (r"(?:https?://)?((?:[^.]+\.)?photobucket\.com)"
|
pattern = (r"(?:https?://)?((?:[\w-]+\.)?photobucket\.com)"
|
||||||
r"/user/[^/?#]+/library(?:/[^?#]*)?")
|
r"/user/[^/?&#]+/library(?:/[^?&#]*)?")
|
||||||
test = (
|
test = (
|
||||||
("https://s369.photobucket.com/user/CrpyLrkr/library", {
|
("https://s369.photobucket.com/user/CrpyLrkr/library", {
|
||||||
"pattern": r"https?://[oi]+\d+.photobucket.com/albums/oo139/",
|
"pattern": r"https?://[oi]+\d+.photobucket.com/albums/oo139/",
|
||||||
@ -109,9 +109,9 @@ class PhotobucketImageExtractor(Extractor):
|
|||||||
directory_fmt = ("{category}", "{username}")
|
directory_fmt = ("{category}", "{username}")
|
||||||
filename_fmt = "{pictureId:?/_/}{titleOrFilename}.{extension}"
|
filename_fmt = "{pictureId:?/_/}{titleOrFilename}.{extension}"
|
||||||
archive_fmt = "{username}_{id}"
|
archive_fmt = "{username}_{id}"
|
||||||
pattern = (r"(?:https?://)?(?:[^.]+\.)?photobucket\.com"
|
pattern = (r"(?:https?://)?(?:[\w-]+\.)?photobucket\.com"
|
||||||
r"(?:/gallery/user/([^/?#]+)/media/([^/?#]+)"
|
r"(?:/gallery/user/([^/?&#]+)/media/([^/?&#]+)"
|
||||||
r"|/user/([^/?#]+)/media/[^?#]+\.html)")
|
r"|/user/([^/?&#]+)/media/[^?&#]+\.html)")
|
||||||
test = (
|
test = (
|
||||||
(("https://s271.photobucket.com/user/lakerfanryan"
|
(("https://s271.photobucket.com/user/lakerfanryan"
|
||||||
"/media/Untitled-3-1.jpg.html"), {
|
"/media/Untitled-3-1.jpg.html"), {
|
||||||
|
@ -12,7 +12,7 @@ from .common import Extractor, Message
|
|||||||
from .. import text, exception
|
from .. import text, exception
|
||||||
|
|
||||||
|
|
||||||
BASE_PATTERN = r"(?:https?://)?(?!www\.)([^.]+)\.pixnet.net"
|
BASE_PATTERN = r"(?:https?://)?(?!www\.)([\w-]+)\.pixnet.net"
|
||||||
|
|
||||||
|
|
||||||
class PixnetExtractor(Extractor):
|
class PixnetExtractor(Extractor):
|
||||||
|
@ -12,7 +12,7 @@ from .common import Extractor, Message
|
|||||||
from .. import text, exception
|
from .. import text, exception
|
||||||
|
|
||||||
|
|
||||||
BASE_PATTERN = r"(?:https?://)?(?:[^.]+\.)?pornhub\.com"
|
BASE_PATTERN = r"(?:https?://)?(?:[\w-]+\.)?pornhub\.com"
|
||||||
|
|
||||||
|
|
||||||
class PornhubExtractor(Extractor):
|
class PornhubExtractor(Extractor):
|
||||||
|
@ -13,7 +13,7 @@ from .. import text
|
|||||||
import time
|
import time
|
||||||
|
|
||||||
|
|
||||||
BASE_PATTERN = r"(?:https?://)?([^.]+)\.slickpic\.com"
|
BASE_PATTERN = r"(?:https?://)?([\w-]+)\.slickpic\.com"
|
||||||
|
|
||||||
|
|
||||||
class SlickpicExtractor(Extractor):
|
class SlickpicExtractor(Extractor):
|
||||||
|
@ -13,7 +13,7 @@ from .. import text, oauth, exception
|
|||||||
|
|
||||||
BASE_PATTERN = (
|
BASE_PATTERN = (
|
||||||
r"(?:smugmug:(?!album:)(?:https?://)?([^/]+)|"
|
r"(?:smugmug:(?!album:)(?:https?://)?([^/]+)|"
|
||||||
r"(?:https?://)?([^.]+)\.smugmug\.com)")
|
r"(?:https?://)?([\w-]+)\.smugmug\.com)")
|
||||||
|
|
||||||
|
|
||||||
class SmugmugExtractor(Extractor):
|
class SmugmugExtractor(Extractor):
|
||||||
|
@ -35,7 +35,7 @@ POST_TYPES = frozenset((
|
|||||||
|
|
||||||
BASE_PATTERN = (
|
BASE_PATTERN = (
|
||||||
r"(?:tumblr:(?:https?://)?([^/]+)|"
|
r"(?:tumblr:(?:https?://)?([^/]+)|"
|
||||||
r"(?:https?://)?([^.]+\.tumblr\.com))")
|
r"(?:https?://)?([\w-]+\.tumblr\.com))")
|
||||||
|
|
||||||
|
|
||||||
class TumblrExtractor(Extractor):
|
class TumblrExtractor(Extractor):
|
||||||
|
@ -13,7 +13,7 @@ from .. import text
|
|||||||
import json
|
import json
|
||||||
|
|
||||||
|
|
||||||
BASE_PATTERN = (r"(?:https?://)?((?:[^.]+\.)?xhamster"
|
BASE_PATTERN = (r"(?:https?://)?((?:[\w-]+\.)?xhamster"
|
||||||
r"(?:\d?\.(?:com|one|desi)|\.porncache\.net))")
|
r"(?:\d?\.(?:com|one|desi)|\.porncache\.net))")
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user