[reddit] some small fixes
- filter or complete some URLs
- remove the 'nofollow:' scheme before printing URLs
- (#15)
This commit is contained in:
parent
a22892f494
commit
e425243b1e
@ -1,6 +1,6 @@
|
|||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
# Copyright 2015, 2016 Mike Fährmann
|
# Copyright 2015-2017 Mike Fährmann
|
||||||
#
|
#
|
||||||
# This program is free software; you can redistribute it and/or modify
|
# This program is free software; you can redistribute it and/or modify
|
||||||
# it under the terms of the GNU General Public License version 2 as
|
# it under the terms of the GNU General Public License version 2 as
|
||||||
@ -14,9 +14,9 @@ from .. import adapter
|
|||||||
|
|
||||||
|
|
||||||
class RecursiveExtractor(Extractor):
|
class RecursiveExtractor(Extractor):
|
||||||
|
"""Extractor that fetches URLs from a remote or local source"""
|
||||||
category = "recursive"
|
category = "recursive"
|
||||||
pattern = ["r(?:ecursive)?:(.+)"]
|
pattern = [r"r(?:ecursive)?:(.+)"]
|
||||||
test = [("recursive:https://pastebin.com/raw/FLwrCYsT", {
|
test = [("recursive:https://pastebin.com/raw/FLwrCYsT", {
|
||||||
"url": "eee86d65c346361b818e8f4b2b307d9429f136a2",
|
"url": "eee86d65c346361b818e8f4b2b307d9429f136a2",
|
||||||
})]
|
})]
|
||||||
|
@ -34,7 +34,11 @@ class RedditExtractor(Extractor):
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
for url in urls:
|
for url in urls:
|
||||||
if regex.match(url):
|
if url[0] == "#":
|
||||||
|
continue
|
||||||
|
elif url[0] == "/":
|
||||||
|
url = "nofollow:https://www.reddit.com" + url
|
||||||
|
elif regex.match(url):
|
||||||
url = "nofollow:" + url
|
url = "nofollow:" + url
|
||||||
yield Message.Queue, url
|
yield Message.Queue, url
|
||||||
|
|
||||||
@ -61,7 +65,8 @@ class RedditSubmissionExtractor(RedditExtractor):
|
|||||||
"""Extractor for images from a submission on reddit.com"""
|
"""Extractor for images from a submission on reddit.com"""
|
||||||
subcategory = "subreddit"
|
subcategory = "subreddit"
|
||||||
pattern = [(r"(?:https?://)?(?:m\.|www\.)?reddit\.com/r/[^/]+"
|
pattern = [(r"(?:https?://)?(?:m\.|www\.)?reddit\.com/r/[^/]+"
|
||||||
r"/comments/([^/]+)")]
|
r"/comments/([a-z0-9]+)"),
|
||||||
|
(r"(?:https?://)?redd\.it/([a-z0-9]+)")]
|
||||||
|
|
||||||
def __init__(self, match):
|
def __init__(self, match):
|
||||||
RedditExtractor.__init__(self)
|
RedditExtractor.__init__(self)
|
||||||
|
@ -214,9 +214,10 @@ class UrlJob(Job):
|
|||||||
Job.__init__(self, url)
|
Job.__init__(self, url)
|
||||||
self.depth = depth
|
self.depth = depth
|
||||||
if depth == self.maxdepth:
|
if depth == self.maxdepth:
|
||||||
self.handle_queue = print
|
self.handle_queue = self._print
|
||||||
|
|
||||||
def handle_url(self, url, _):
|
@staticmethod
|
||||||
|
def handle_url(url, _):
|
||||||
print(url)
|
print(url)
|
||||||
|
|
||||||
def handle_queue(self, url):
|
def handle_queue(self, url):
|
||||||
@ -225,6 +226,12 @@ class UrlJob(Job):
|
|||||||
except exception.NoExtractorError:
|
except exception.NoExtractorError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _print(url):
|
||||||
|
if url.startswith("nofollow:"):
|
||||||
|
url = url[9:]
|
||||||
|
print(url)
|
||||||
|
|
||||||
|
|
||||||
class TestJob(DownloadJob):
|
class TestJob(DownloadJob):
|
||||||
"""Generate test-results for extractor runs"""
|
"""Generate test-results for extractor runs"""
|
||||||
|
Loading…
x
Reference in New Issue
Block a user