rewrite extractors to use text-module

commit 42b8e81a68
parent 2962bf36f6
Author: Mike Fährmann
Date:   2015-10-03 15:43:02 +02:00

11 changed files with 66 additions and 98 deletions

View File

@@ -8,10 +8,8 @@
 """Extract manga pages from http://bato.to/"""

-from .common import AsynchronousExtractor
-from .common import Message
-from .common import filename_from_url, unescape
-from urllib.parse import unquote
+from .common import AsynchronousExtractor, Message
+from .. import text
 import os.path
 import re

@@ -44,13 +42,13 @@ class BatotoExtractor(AsynchronousExtractor):
     def get_page_metadata(self, page_url):
         """Collect next url and metadata for one manga-page"""
         page = self.request(page_url).text
-        _    , pos = self.extract(page, 'selected="selected"', '')
-        title, pos = self.extract(page, ': ', '<', pos)
-        _    , pos = self.extract(page, 'selected="selected"', '', pos)
-        trans, pos = self.extract(page, '>', '<', pos)
-        _    , pos = self.extract(page, '<div id="full_image"', '', pos)
-        image, pos = self.extract(page, '<img src="', '"', pos)
-        url  , pos = self.extract(page, '<a href="', '"', pos)
+        _    , pos = text.extract(page, 'selected="selected"', '')
+        title, pos = text.extract(page, ': ', '<', pos)
+        _    , pos = text.extract(page, 'selected="selected"', '', pos)
+        trans, pos = text.extract(page, '>', '<', pos)
+        _    , pos = text.extract(page, '<div id="full_image"', '', pos)
+        image, pos = text.extract(page, '<img src="', '"', pos)
+        url  , pos = text.extract(page, '<a href="', '"', pos)
         mmatch = re.search(
             r"<title>(.+) - (?:vol (\d+) )?"
             r"ch (\d+)[^ ]+ Page (\d+) | Batoto!</title>",
@@ -60,18 +58,18 @@ class BatotoExtractor(AsynchronousExtractor):
             r"(.+) - ([^ ]+)",
             trans
         )
-        filename = unquote(filename_from_url(image))
+        filename = text.unquote(text.filename_from_url(image))
         name, ext = os.path.splitext(filename)
         return url, {
             "category": info["category"],
             "chapter-id": self.chapter_id,
-            "manga": unescape(mmatch.group(1)),
+            "manga": text.unescape(mmatch.group(1)),
             "volume": mmatch.group(2) or "",
             "chapter": mmatch.group(3),
             "page": mmatch.group(4),
             "group": tmatch.group(1),
             "language": tmatch.group(2),
-            "title": unescape(title),
+            "title": text.unescape(title),
             "image-url": image,
             "name": name,
             "extension": ext[1:],

View File

@@ -8,15 +8,13 @@
 """Base classes for extractors for danbooru and co"""

-from .common import SequentialExtractor
-from .common import Message
-from .common import filename_from_url
+from .common import SequentialExtractor, Message
+from .. import text
 import xml.etree.ElementTree as ET
 import json
 import os.path
 import urllib.parse

 class BooruExtractor(SequentialExtractor):

     api_url = ""
@@ -24,7 +22,7 @@ class BooruExtractor(SequentialExtractor):
     def __init__(self, match, config, info):
         SequentialExtractor.__init__(self, config)
         self.info = info
-        self.tags = urllib.parse.unquote(match.group(1))
+        self.tags = text.unquote(match.group(1))
         self.page = "page"
         self.params = {"tags": self.tags}
         self.headers = {}
@@ -58,8 +56,8 @@ class BooruExtractor(SequentialExtractor):
     def get_file_metadata(self, data):
         """Collect metadata for a downloadable file"""
         data["category"] = self.info["category"]
-        data["name"] = urllib.parse.unquote(
-            filename_from_url(self.get_file_url(data))
+        data["name"] = text.unquote(
+            text.filename_from_url(self.get_file_url(data))
         )
         data["extension"] = os.path.splitext(data["name"])[1][1:]
         return data

View File

@@ -9,6 +9,7 @@
 """Base classes for extractors for different Futaba Channel boards"""

 from .common import SequentialExtractor, Message
+from .. import text
 import re

 class ChanExtractor(SequentialExtractor):
@@ -44,5 +45,4 @@ class ChanExtractor(SequentialExtractor):
         """Return thread title from first post"""
         if "sub" in post:
             return post["sub"]
-        com = re.sub("<[^>]+?>", "", post["com"])
-        return " ".join(com.split())[:50]
+        return text.remove_html(post["com"])[:50]

View File

@@ -44,24 +44,6 @@ class Extractor():
         "Mozilla/5.0 (X11; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0"
     )

-    @staticmethod
-    def extract(txt, begin, end, pos=0):
-        try:
-            first = txt.index(begin, pos) + len(begin)
-            last = txt.index(end, first)
-            return txt[first:last], last+len(end)
-        except ValueError:
-            return None, pos
-
-    @staticmethod
-    def extract_all(txt, begin, end, pos=0):
-        try:
-            first = txt.index(begin, pos)
-            last = txt.index(end, first + len(begin)) + len(end)
-            return txt[first:last], last
-        except ValueError:
-            return None, pos

 class SequentialExtractor(Extractor):
@@ -123,9 +105,3 @@ def safe_request(session, url, method="GET", *args, **kwargs):
     # everything ok -- proceed to download
     return r

-def filename_from_url(url):
-    pos = url.rfind("/")
-    return url[pos+1:]
-
-unescape = html.parser.HTMLParser().unescape
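The helpers deleted here line up one-to-one with the text.* calls introduced across the extractors. Their new home is not part of this diff; based on the removed code and the call sites, a hypothetical reconstruction of the module (reachable as `from .. import text`, so presumably a text.py one package above the extractors) might look like:

# Hypothetical text module -- names and location inferred from this diff,
# not confirmed by it. extract() and remove_html() are as sketched earlier.
import html.parser
import urllib.parse

def filename_from_url(url):
    """Return the last path segment of a URL (moved from common.py)."""
    return url[url.rfind("/") + 1:]

# thin aliases, matching call sites like text.unquote(...) / text.unescape(...)
unquote = urllib.parse.unquote
unescape = html.parser.HTMLParser().unescape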

View File

@@ -8,9 +8,8 @@
 """Extract images from galleries at http://www.imagebam.com/"""

-from .common import AsynchronousExtractor
-from .common import Message
-from .common import filename_from_url
+from .common import AsynchronousExtractor, Message
+from .. import text

 info = {
     "category": "imagebam",
@@ -42,28 +41,28 @@ class ImagebamExtractor(AsynchronousExtractor):
         done = False
         while not done:
             # get current page
-            text = self.request(self.url_base + next_url).text
+            page = self.request(self.url_base + next_url).text
             # get url for next page
-            next_url, pos = self.extract(text, "<a class='buttonblue' href='", "'")
+            next_url, pos = text.extract(page, "<a class='buttonblue' href='", "'")
             # if the following text isn't "><span>next image" we are done
-            if not text.startswith("><span>next image", pos):
+            if not page.startswith("><span>next image", pos):
                 done = True
             # get image url
-            img_url, pos = self.extract(text, 'onclick="scale(this);" src="', '"', pos)
+            img_url, pos = text.extract(page, 'onclick="scale(this);" src="', '"', pos)
             yield Message.Url, img_url, self.get_file_metadata(img_url)

     def get_job_metadata(self):
         """Collect metadata for extractor-job"""
         gallery_key = self.match.group(2)
-        text = self.request(self.url_base + "/gallery/" + gallery_key).text
-        _    , pos = self.extract(text, "<img src='/img/icons/photos.png'", "")
-        title, pos = self.extract(text, "'> ", " <", pos)
-        count, pos = self.extract(text, "'>", " images", pos)
-        url  , pos = self.extract(text, "<a href='http://www.imagebam.com", "'", pos)
+        page = self.request(self.url_base + "/gallery/" + gallery_key).text
+        _    , pos = text.extract(page, "<img src='/img/icons/photos.png'", "")
+        title, pos = text.extract(page, "'> ", " <", pos)
+        count, pos = text.extract(page, "'>", " images", pos)
+        url  , pos = text.extract(page, "<a href='http://www.imagebam.com", "'", pos)
         return {
             "category": info["category"],
             "key": gallery_key,
@@ -77,5 +76,5 @@ class ImagebamExtractor(AsynchronousExtractor):
         self.num += 1
         data = self.metadata.copy()
         data["num"] = self.num
-        data["name"] = filename_from_url(url)
+        data["name"] = text.filename_from_url(url)
         return data
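The local-variable renames in this file (text → page, and likewise in the imgbox and pixiv files below) are not cosmetic: after `from .. import text`, a local named text would shadow the module on the very lines that now call text.extract(...). A contrived but runnable illustration:

import urllib.parse as text   # stand-in for ``from .. import text``

def broken(response_text):
    text = response_text          # local shadows the imported module
    return text.unquote(text)     # AttributeError: str has no 'unquote'

def fixed(response_text):
    page = response_text          # the rename made in this commit
    return text.unquote(page)     # module still reachable

print(fixed("a%20b"))  # -> 'a b'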

View File

@@ -9,6 +9,7 @@
 """Extract images from galleries at http://imgbox.com/"""

 from .common import AsynchronousExtractor, Message
+from .. import text
 import re

 info = {
@@ -36,8 +37,8 @@ class ImgboxExtractor(AsynchronousExtractor):
         yield Message.Version, 1
         yield Message.Directory, self.metadata
         for match in re.finditer(r'<a href="([^"]+)"><img alt="', page):
-            text = self.request(self.url_base + match.group(1)).text
-            yield Message.Url, self.get_file_url(text), self.get_file_metadata(text)
+            imgpage = self.request(self.url_base + match.group(1)).text
+            yield Message.Url, self.get_file_url(imgpage), self.get_file_metadata(imgpage)

     def get_job_metadata(self, page):
         """Collect metadata for extractor-job"""
@@ -51,16 +52,16 @@ class ImgboxExtractor(AsynchronousExtractor):
             "count": match.group(4),
         }

-    def get_file_metadata(self, text):
+    def get_file_metadata(self, page):
         """Collect metadata for a downloadable file"""
         data = self.metadata.copy()
-        data["num"]      , pos = self.extract(text, '</a> &nbsp; ', ' of ')
-        data["image-key"], pos = self.extract(text, '/i.imgbox.com/', '?download', pos)
-        data["name"]     , pos = self.extract(text, ' title="', '"', pos)
+        data["num"]      , pos = text.extract(page, '</a> &nbsp; ', ' of ')
+        data["image-key"], pos = text.extract(page, '/i.imgbox.com/', '?download', pos)
+        data["name"]     , pos = text.extract(page, ' title="', '"', pos)
         return data

-    def get_file_url(self, text):
+    def get_file_url(self, page):
         """Extract download-url"""
         base = "http://i.imgbox.com/"
-        path, _ = self.extract(text, base, '"')
+        path, _ = text.extract(page, base, '"')
         return base + path

View File

@@ -8,9 +8,8 @@
 """Extract images from albums at http://imgchili.net/"""

-from .common import SequentialExtractor
-from .common import Message
-from .common import filename_from_url
+from .common import SequentialExtractor, Message
+from .. import text
 import re

 info = {
@@ -42,7 +41,7 @@ class ImgchiliExtractor(SequentialExtractor):
     def get_job_metadata(self, page):
         """Collect metadata for extractor-job"""
-        title = self.extract(page, "<h1>", "</h1>")[0]
+        title = text.extract(page, "<h1>", "</h1>")[0]
         return {
             "category": info["category"],
             "title": title,

View File

@@ -8,10 +8,8 @@
 """Extract manga pages from http://www.mangareader.net/"""

-from .common import AsynchronousExtractor
-from .common import Message
-from .common import unescape, filename_from_url
-from urllib.parse import unquote
+from .common import AsynchronousExtractor, Message
+from .. import text
 import os.path
 import re

@@ -47,7 +45,7 @@ class MangaReaderExtractor(AsynchronousExtractor):
     def get_page_metadata(self, page_url):
         """Collect next url, image-url and metadata for one manga-page"""
         page = self.request(page_url).text
-        extr = self.extract
+        extr = text.extract
         width = None
         descr, pos = extr(page, '<meta name="description" content="', '"')
         test , pos = extr(page, "document['pu']", '', pos)
@@ -62,13 +60,13 @@ class MangaReaderExtractor(AsynchronousExtractor):
         width , pos = extr(page, '<img id="img" width="', '"', pos)
         height, pos = extr(page, ' height="', '"', pos)
         image, pos = extr(page, ' src="', '"', pos)
-        filename = unquote(filename_from_url(image))
+        filename = text.unquote(text.filename_from_url(image))
         name, ext = os.path.splitext(filename)
         match = re.match(r"(.*) (\d+) - Read \1 \2 Manga Scans Page (\d+)", descr)
         return self.url_base + url, image, {
             "category": info["category"],
-            "manga": unescape(match.group(1)),
+            "manga": text.unescape(match.group(1)),
             "chapter": match.group(2),
             "page": match.group(3),
             "width": width,

View File

@@ -8,9 +8,8 @@
 """Extract images from https://nijie.info/"""

-from .common import AsynchronousExtractor
-from .common import Message
-from .common import filename_from_url
+from .common import AsynchronousExtractor, Message
+from ..text import filename_from_url
 import re

 info = {
@@ -56,6 +55,7 @@ class NijieExtractor(AsynchronousExtractor):
         }

     def get_image_ids(self):
+        """Collect all image-ids for a specific artist"""
         text = self.request(self.artist_url).text
         regex = r'<a href="/view\.php\?id=(\d+)"'
         return [m.group(1) for m in re.finditer(regex, text)]

View File

@@ -8,8 +8,8 @@
 """Extract images and ugoira from http://www.pixiv.net/"""

-from .common import SequentialExtractor
-from .common import Message
+from .common import SequentialExtractor, Message
+from .. import text
 import re
 import json

@@ -84,9 +84,9 @@ class PixivExtractor(SequentialExtractor):
     def get_works(self):
         """Yield all work-items for a pixiv-member"""
-        page = 1
+        pagenum = 1
         while True:
-            data = self.api.user_works(self.artist_id, page)
+            data = self.api.user_works(self.artist_id, pagenum)
             for work in data["response"]:
                 url = work["image_urls"]["large"]
                 work["num"] = ""
@@ -96,17 +96,17 @@ class PixivExtractor(SequentialExtractor):
             pinfo = data["pagination"]
             if pinfo["current"] == pinfo["pages"]:
                 return
-            page = pinfo["next"]
+            pagenum = pinfo["next"]

     def parse_ugoira(self, data):
         """Parse ugoira data"""
         # get illust page
-        text = self.request(
+        page = self.request(
             self.illust_url, params={"illust_id": data["id"]},
         ).text
         # parse page
-        frames, _ = self.extract(text, ',"frames":[', ']')
+        frames, _ = text.extract(page, ',"frames":[', ']')
         # build url
         url = re.sub(

View File

@@ -8,9 +8,8 @@
 """Extract manga pages from http://manga.redhawkscans.com/"""

-from .common import SequentialExtractor
-from .common import Message
-from .common import unescape
+from .common import SequentialExtractor, Message
+from .. import text
 import os.path
 import json
 import re

@@ -50,16 +49,16 @@ class RedHawkScansExtractor(SequentialExtractor):
         response = self.request(self.url_base + self.part)
         response.encoding = "utf-8"
         page = response.text
-        _        , pos = self.extract(page, '<h1 class="tbtitle dnone">', '')
-        manga    , pos = self.extract(page, 'title="', '"', pos)
-        chapter  , pos = self.extract(page, '">', '</a>', pos)
-        json_data, pos = self.extract(page, 'var pages = ', ';\r\n', pos)
+        _        , pos = text.extract(page, '<h1 class="tbtitle dnone">', '')
+        manga    , pos = text.extract(page, 'title="', '"', pos)
+        chapter  , pos = text.extract(page, '">', '</a>', pos)
+        json_data, pos = text.extract(page, 'var pages = ', ';\r\n', pos)
         match = re.match(r"(Chapter (\d+)([^:+]*)(?:: (.*))?|[^:]+)", chapter)
         return {
             "category": info["category"],
-            "manga": unescape(manga),
+            "manga": text.unescape(manga),
             "chapter": match.group(2) or match.group(1),
             "chapter-minor": match.group(3) or "",
             "language": "English",
-            "title": unescape(match.group(4) or ""),
+            "title": text.unescape(match.group(4) or ""),
         }, json.loads(json_data)
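One detail worth isolating here: json_data is sliced out of an embedded JavaScript assignment (var pages = [...];) and handed straight to json.loads, so no HTML or JS parser is needed as long as the array literal is valid JSON. The same pattern, reduced to plain slicing:

import json

page = 'var pages = [{"url": "001.png"}, {"url": "002.png"}];\r\n'
start = page.index("var pages = ") + len("var pages = ")
end = page.index(";\r\n", start)
pages = json.loads(page[start:end])
print(pages[0]["url"])  # -> '001.png'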