diff --git a/gallery_dl/extractor/batoto.py b/gallery_dl/extractor/batoto.py index ac363052..65bc7c3d 100644 --- a/gallery_dl/extractor/batoto.py +++ b/gallery_dl/extractor/batoto.py @@ -8,10 +8,8 @@ """Extract manga pages from http://bato.to/""" -from .common import AsynchronousExtractor -from .common import Message -from .common import filename_from_url, unescape -from urllib.parse import unquote +from .common import AsynchronousExtractor, Message +from .. import text import os.path import re @@ -44,13 +42,13 @@ class BatotoExtractor(AsynchronousExtractor): def get_page_metadata(self, page_url): """Collect next url and metadata for one manga-page""" page = self.request(page_url).text - _ , pos = self.extract(page, 'selected="selected"', '') - title, pos = self.extract(page, ': ', '<', pos) - _ , pos = self.extract(page, 'selected="selected"', '', pos) - trans, pos = self.extract(page, '>', '<', pos) - _ , pos = self.extract(page, '
', '<', pos) + _ , pos = text.extract(page, '
(.+) - (?:vol (\d+) )?" r"ch (\d+)[^ ]+ Page (\d+) | Batoto!", @@ -60,18 +58,18 @@ class BatotoExtractor(AsynchronousExtractor): r"(.+) - ([^ ]+)", trans ) - filename = unquote(filename_from_url(image)) + filename = text.unquote(text.filename_from_url(image)) name, ext = os.path.splitext(filename) return url, { "category": info["category"], "chapter-id": self.chapter_id, - "manga": unescape(mmatch.group(1)), + "manga": text.unescape(mmatch.group(1)), "volume": mmatch.group(2) or "", "chapter": mmatch.group(3), "page": mmatch.group(4), "group": tmatch.group(1), "language": tmatch.group(2), - "title": unescape(title), + "title": text.unescape(title), "image-url": image, "name": name, "extension": ext[1:], diff --git a/gallery_dl/extractor/booru.py b/gallery_dl/extractor/booru.py index 88600397..f72bc789 100644 --- a/gallery_dl/extractor/booru.py +++ b/gallery_dl/extractor/booru.py @@ -8,15 +8,13 @@ """Base classes for extractors for danbooru and co""" -from .common import SequentialExtractor -from .common import Message -from .common import filename_from_url +from .common import SequentialExtractor, Message +from .. import text import xml.etree.ElementTree as ET import json import os.path import urllib.parse - class BooruExtractor(SequentialExtractor): api_url = "" @@ -24,7 +22,7 @@ class BooruExtractor(SequentialExtractor): def __init__(self, match, config, info): SequentialExtractor.__init__(self, config) self.info = info - self.tags = urllib.parse.unquote(match.group(1)) + self.tags = text.unquote(match.group(1)) self.page = "page" self.params = {"tags": self.tags} self.headers = {} @@ -58,8 +56,8 @@ class BooruExtractor(SequentialExtractor): def get_file_metadata(self, data): """Collect metadata for a downloadable file""" data["category"] = self.info["category"] - data["name"] = urllib.parse.unquote( - filename_from_url(self.get_file_url(data)) + data["name"] = text.unquote( + text.filename_from_url(self.get_file_url(data)) ) data["extension"] = os.path.splitext(data["name"])[1][1:] return data diff --git a/gallery_dl/extractor/chan.py b/gallery_dl/extractor/chan.py index 2f943068..2d2b6fb4 100644 --- a/gallery_dl/extractor/chan.py +++ b/gallery_dl/extractor/chan.py @@ -9,6 +9,7 @@ """Base classes for extractors for different Futaba Channel boards""" from .common import SequentialExtractor, Message +from .. import text import re class ChanExtractor(SequentialExtractor): @@ -44,5 +45,4 @@ class ChanExtractor(SequentialExtractor): """Return thread title from first post""" if "sub" in post: return post["sub"] - com = re.sub("<[^>]+?>", "", post["com"]) - return " ".join(com.split())[:50] + return text.remove_html(post["com"])[:50] diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index cb8e91ca..b364d870 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -44,24 +44,6 @@ class Extractor(): "Mozilla/5.0 (X11; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0" ) - @staticmethod - def extract(txt, begin, end, pos=0): - try: - first = txt.index(begin, pos) + len(begin) - last = txt.index(end, first) - return txt[first:last], last+len(end) - except ValueError: - return None, pos - - @staticmethod - def extract_all(txt, begin, end, pos=0): - try: - first = txt.index(begin, pos) - last = txt.index(end, first + len(begin)) + len(end) - return txt[first:last], last - except ValueError: - return None, pos - class SequentialExtractor(Extractor): @@ -123,9 +105,3 @@ def safe_request(session, url, method="GET", *args, **kwargs): # everything ok -- proceed to download return r - -def filename_from_url(url): - pos = url.rfind("/") - return url[pos+1:] - -unescape = html.parser.HTMLParser().unescape diff --git a/gallery_dl/extractor/imagebam.py b/gallery_dl/extractor/imagebam.py index f8886a7a..c89721f2 100644 --- a/gallery_dl/extractor/imagebam.py +++ b/gallery_dl/extractor/imagebam.py @@ -8,9 +8,8 @@ """Extract images from galleries at http://www.imagebam.com/""" -from .common import AsynchronousExtractor -from .common import Message -from .common import filename_from_url +from .common import AsynchronousExtractor, Message +from .. import text info = { "category": "imagebam", @@ -42,28 +41,28 @@ class ImagebamExtractor(AsynchronousExtractor): done = False while not done: # get current page - text = self.request(self.url_base + next_url).text + page = self.request(self.url_base + next_url).text # get url for next page - next_url, pos = self.extract(text, "next image" we are done - if not text.startswith(">next image", pos): + if not page.startswith(">next image", pos): done = True # get image url - img_url, pos = self.extract(text, 'onclick="scale(this);" src="', '"', pos) + img_url, pos = text.extract(page, 'onclick="scale(this);" src="', '"', pos) yield Message.Url, img_url, self.get_file_metadata(img_url) def get_job_metadata(self): """Collect metadata for extractor-job""" gallery_key = self.match.group(2) - text = self.request(self.url_base + "/gallery/" + gallery_key).text - _ , pos = self.extract(text, " ", " <", pos) - count, pos = self.extract(text, "'>", " images", pos) - url , pos = self.extract(text, " ", " <", pos) + count, pos = text.extract(page, "'>", " images", pos) + url , pos = text.extract(page, "', page):
-            text = self.request(self.url_base + match.group(1)).text
-            yield Message.Url, self.get_file_url(text), self.get_file_metadata(text)
+            imgpage = self.request(self.url_base + match.group(1)).text
+            yield Message.Url, self.get_file_url(imgpage), self.get_file_metadata(imgpage)
 
     def get_job_metadata(self, page):
           ', ' of ') - data["image-key"], pos = self.extract(text, '/i.imgbox.com/', '?download', pos) - data["name"] , pos = self.extract(text, ' title="', '"', pos) + data["num"] , pos = text.extract(page, '   ', ' of ') + data["image-key"], pos = text.extract(page, '/i.imgbox.com/', '?download', pos) + data["name"] , pos = text.extract(page, ' title="', '"', pos) return data - def get_file_url(self, text): + def get_file_url(self, page): """Extract download-url""" base = "http://i.imgbox.com/" - path, _ = self.extract(text, base, '"') + path, _ = text.extract(page, base, '"') return base + path diff --git a/gallery_dl/extractor/imgchili.py b/gallery_dl/extractor/imgchili.py index 40932912..9e591e57 100644 --- a/gallery_dl/extractor/imgchili.py +++ b/gallery_dl/extractor/imgchili.py @@ -8,9 +8,8 @@ """Extract images from albums at http://imgchili.net/""" -from .common import SequentialExtractor -from .common import Message -from .common import filename_from_url +from .common import SequentialExtractor, Message +from .. import text import re info = { @@ -42,7 +41,7 @@ class ImgchiliExtractor(SequentialExtractor): def get_job_metadata(self, page): """Collect metadata for extractor-job""" - title = self.extract(page, "

", "

")[0] + title = text.extract(page, "

", "

")[0] return { "category": info["category"], "title": title, diff --git a/gallery_dl/extractor/mangareader.py b/gallery_dl/extractor/mangareader.py index 62575308..60ed473a 100644 --- a/gallery_dl/extractor/mangareader.py +++ b/gallery_dl/extractor/mangareader.py @@ -8,10 +8,8 @@ """Extract manga pages from http://www.mangareader.net/""" -from .common import AsynchronousExtractor -from .common import Message -from .common import unescape, filename_from_url -from urllib.parse import unquote +from .common import AsynchronousExtractor, Message +from .. import text import os.path import re @@ -47,7 +45,7 @@ class MangaReaderExtractor(AsynchronousExtractor): def get_page_metadata(self, page_url): """Collect next url, image-url and metadata for one manga-page""" page = self.request(page_url).text - extr = self.extract + extr = text.extract width = None descr, pos = extr(page, '', '') - manga , pos = self.extract(page, 'title="', '"', pos) - chapter , pos = self.extract(page, '">', '', pos) - json_data, pos = self.extract(page, 'var pages = ', ';\r\n', pos) + _ , pos = text.extract(page, '

', '') + manga , pos = text.extract(page, 'title="', '"', pos) + chapter , pos = text.extract(page, '">', '', pos) + json_data, pos = text.extract(page, 'var pages = ', ';\r\n', pos) match = re.match(r"(Chapter (\d+)([^:+]*)(?:: (.*))?|[^:]+)", chapter) return { "category": info["category"], - "manga": unescape(manga), + "manga": text.unescape(manga), "chapter": match.group(2) or match.group(1), "chapter-minor": match.group(3) or "", "language": "English", - "title": unescape(match.group(4) or ""), + "title": text.unescape(match.group(4) or ""), }, json.loads(json_data)