[reddit] some small fixes
- filter or complete some URLs
- remove the 'nofollow:' scheme before printing URLs
- (#15)
This commit is contained in:
parent
a22892f494
commit
e425243b1e
@ -1,6 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2015, 2016 Mike Fährmann
|
||||
# Copyright 2015-2017 Mike Fährmann
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License version 2 as
|
||||
@ -14,9 +14,9 @@ from .. import adapter
|
||||
|
||||
|
||||
class RecursiveExtractor(Extractor):
|
||||
|
||||
"""Extractor that fetches URLs from a remote or local source"""
|
||||
category = "recursive"
|
||||
pattern = ["r(?:ecursive)?:(.+)"]
|
||||
pattern = [r"r(?:ecursive)?:(.+)"]
|
||||
test = [("recursive:https://pastebin.com/raw/FLwrCYsT", {
|
||||
"url": "eee86d65c346361b818e8f4b2b307d9429f136a2",
|
||||
})]
|
||||
|
@ -34,7 +34,11 @@ class RedditExtractor(Extractor):
|
||||
)
|
||||
)
|
||||
for url in urls:
|
||||
if regex.match(url):
|
||||
if url[0] == "#":
|
||||
continue
|
||||
elif url[0] == "/":
|
||||
url = "nofollow:https://www.reddit.com" + url
|
||||
elif regex.match(url):
|
||||
url = "nofollow:" + url
|
||||
yield Message.Queue, url
|
||||
|
||||
@ -61,7 +65,8 @@ class RedditSubmissionExtractor(RedditExtractor):
|
||||
"""Extractor for images from a submission on reddit.com"""
|
||||
subcategory = "subreddit"
|
||||
pattern = [(r"(?:https?://)?(?:m\.|www\.)?reddit\.com/r/[^/]+"
|
||||
r"/comments/([^/]+)")]
|
||||
r"/comments/([a-z0-9]+)"),
|
||||
(r"(?:https?://)?redd\.it/([a-z0-9]+)")]
|
||||
|
||||
def __init__(self, match):
|
||||
RedditExtractor.__init__(self)
|
||||
|
@ -214,9 +214,10 @@ class UrlJob(Job):
|
||||
Job.__init__(self, url)
|
||||
self.depth = depth
|
||||
if depth == self.maxdepth:
|
||||
self.handle_queue = print
|
||||
self.handle_queue = self._print
|
||||
|
||||
def handle_url(self, url, _):
|
||||
@staticmethod
|
||||
def handle_url(url, _):
|
||||
print(url)
|
||||
|
||||
def handle_queue(self, url):
|
||||
@ -225,6 +226,12 @@ class UrlJob(Job):
|
||||
except exception.NoExtractorError:
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def _print(url):
|
||||
if url.startswith("nofollow:"):
|
||||
url = url[9:]
|
||||
print(url)
|
||||
|
||||
|
||||
class TestJob(DownloadJob):
|
||||
"""Generate test-results for extractor runs"""
|
||||
|
Loading…
x
Reference in New Issue
Block a user