[mangadex] add chapter- and manga-extractor

This commit is contained in:
Mike Fährmann 2018-03-05 18:37:21 +01:00
parent b58449fd88
commit 749fbbfa6c
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88
8 changed files with 293 additions and 1 deletions

View File

@ -1,5 +1,7 @@
# Changelog
## Unreleased
## 1.3.0 - 2018-03-02
- Added `--proxy` to explicitly specify a proxy server ([#76](https://github.com/mikf/gallery-dl/issues/76))
- Added options to customize [archive ID formats](https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst#extractorarchive-format) and [undefined replacement fields](https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst#extractorkeywords-default)

View File

@ -0,0 +1,136 @@
{
"base-directory": "/tmp/",
"netrc": false,
"downloader":
{
"part": true,
"part-directory": null,
"http":
{
"rate": null,
"retries": 5,
"timeout": 30,
"verify": true
}
},
"extractor":
{
"archive": null,
"proxy": null,
"skip": true,
"sleep": 0,
"pixiv":
{
"user":
{
"directory": ["{category}", "{user[id]}"]
},
"bookmark":
{
"directory": ["{category}", "my bookmarks"]
},
"ugoira": true,
"username": null,
"password": null
},
"batoto":
{
"username": null,
"password": null
},
"exhentai":
{
"wait-min": 3,
"wait-max": 6,
"original": true,
"username": null,
"password": null,
"cookies": {
"igneous": null,
"s": null,
"yay": "louder"
}
},
"nijie":
{
"username": null,
"password": null
},
"sankaku":
{
"wait-min": 2,
"wait-max": 4,
"username": null,
"password": null
},
"seiga":
{
"username": null,
"password": null
},
"gelbooru":
{
"filename": "{category}_{id:>07}_{md5}.{extension}",
"api": true
},
"reddit":
{
"refresh-token": null,
"comments": 500,
"morecomments": false,
"date-min": 0,
"date-max": 253402210800,
"date-format": "%Y-%m-%dT%H:%M:%S",
"id-min": "0",
"id-max": "ZIK0ZJ",
"recursion": 0
},
"flickr":
{
"access-token": null,
"access-token-secret": null,
"metadata": false,
"size-max": null
},
"deviantart":
{
"refresh-token": null,
"flat": true,
"mature": true,
"original": true
},
"gfycat":
{
"format": "mp4"
},
"imgur":
{
"mp4": true
},
"tumblr":
{
"posts": "photo",
"inline": false,
"reblogs": true,
"external": false
},
"recursive":
{
"blacklist": ["directlink", "oauth", "recursive", "test"]
},
"oauth":
{
"browser": true
}
},
"output":
{
"mode": "auto",
"shorten": true,
"progress": true,
"logfile": null,
"unsupportedfile": null
}
}

View File

@ -47,6 +47,7 @@ Luscious https://luscious.net/ Albums
Manga Fox http://fanfox.net/ Chapters
Manga Here http://www.mangahere.co/ Chapters, Manga
Manga Stream https://mangastream.com/ Chapters
Mangadex https://mangadex.org/ Chapters, Manga
Mangapanda https://www.mangapanda.com/ Chapters, Manga
MangaPark https://mangapark.me/ Chapters, Manga
Mangareader https://www.mangareader.net/ Chapters, Manga

View File

@ -51,6 +51,7 @@ modules = [
"konachan",
"loveisover",
"luscious",
"mangadex",
"mangafox",
"mangahere",
"mangapanda",

View File

@ -0,0 +1,148 @@
# -*- coding: utf-8 -*-
# Copyright 2018 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extract manga-chapters and entire manga from https://mangadex.org/"""
from .common import ChapterExtractor, MangaExtractor
from .. import text, util
from urllib.parse import urljoin
import json
import re
class MangadexExtractor:
    """Common attributes shared by all mangadex extractors"""
    # base URL that the subclasses prepend to site-relative paths
    root = "https://mangadex.org"
    # category name of this site
    category = "mangadex"
class MangadexChapterExtractor(MangadexExtractor, ChapterExtractor):
    """Extractor for manga-chapters from mangadex.org"""
    pattern = [r"(?:https?://)?(?:www\.)?mangadex\.(?:org|com)/chapter/(\d+)"]
    test = [
        ("https://mangadex.org/chapter/122094", {
            "keyword": "b4c83fe41f125eae745c2e00d29e087cc4eb78df",
            "content": "7ab3bef5caccb62b881f8e6e70359d3c7be8137f",
        }),
        # oneshot
        ("https://mangadex.org/chapter/138086", {
            "count": 64,
            "keyword": "9b1b7292f7dbcf10983fbdc34b8cdceeb47328ee",
        }),
    ]

    def __init__(self, match):
        """Store the chapter ID captured by 'pattern' and build the URL.

        'match' is the regex match object for 'pattern'; its first group
        is the numeric chapter ID (kept as a string in 'self.chapter_id').
        """
        self.chapter_id = match.group(1)
        url = self.root + "/chapter/" + self.chapter_id
        ChapterExtractor.__init__(self, url)

    def get_metadata(self, page):
        """Collect chapter metadata from the HTML of a chapter page.

        Returns a dict with the keys 'manga', 'manga_id', 'volume',
        'chapter', 'chapter_minor', 'chapter_id', 'chapter_string',
        'group', 'lang' and 'language'.

        Raises AttributeError if the 'og:title' value does not contain
        a parenthesized manga name (the regex below then yields None).
        """
        info, pos = text.extract(page, '="og:title" content="', '"')
        manga_id, pos = text.extract(page, '/images/manga/', '.', pos)
        # the next two results are discarded; only the advanced position
        # (past the group selector and its selected entry) is used
        _, pos = text.extract(page, ' id="jump_group"', '', pos)
        _, pos = text.extract(page, ' selected ', '', pos)
        # search starts 100 characters before the current position --
        # presumably the language title precedes the ' selected ' marker;
        # TODO confirm against the page markup
        language, ___ = text.extract(page, " title='", "'", pos-100)
        group, pos = text.extract(page, '>', '<', pos)

        info = text.unescape(info)
        # groups: 1=volume, 2=chapter, 3=chapter suffix (non-space
        # characters directly after the chapter number), 4=fallback text
        # for titles without "Ch. N" (unused), 5=manga name in parentheses
        match = re.match(
            r"(?:(?:Vol\. (\d+) )?Ch\. (\d+)([^ ]*)|(.*)) "
            r"\(([^)]+)\)",
            info)

        # remove the site-name suffix as a whole; str.rstrip() would treat
        # its argument as a set of characters and could eat into the title
        suffix = " - MangaDex"
        if info.endswith(suffix):
            chapter_string = info[:-len(suffix)]
        else:
            chapter_string = info

        return {
            "manga": match.group(5),
            "manga_id": util.safe_int(manga_id),
            "volume": util.safe_int(match.group(1)),
            "chapter": util.safe_int(match.group(2)),
            "chapter_minor": match.group(3) or "",
            "chapter_id": util.safe_int(self.chapter_id),
            "chapter_string": chapter_string,
            "group": text.unescape(group),
            "lang": util.language_to_code(language),
            "language": language,
        }

    def get_images(self, page):
        """Return a list of (image-url, None) tuples for all chapter pages.

        The URLs are assembled from the 'dataurl', 'page_array' and
        'server' JavaScript variables embedded in the chapter page.
        """
        dataurl, pos = text.extract(page, "var dataurl = '", "'")
        pagelist, pos = text.extract(page, "var page_array = [", "]", pos)
        server, pos = text.extract(page, "var server = '", "'", pos)

        # 'server' is joined onto the site root with urljoin()
        base = urljoin(self.root, server + dataurl + "/")

        # turn the JavaScript array body into valid JSON: single quotes
        # become double quotes and a trailing comma is dropped
        filenames = json.loads(
            "[" + pagelist.replace("'", '"').rstrip(",") + "]"
        )
        return [(base + name, None) for name in filenames]
class MangadexMangaExtractor(MangadexExtractor, MangaExtractor):
    """Extractor for manga from mangadex.org"""
    pattern = [r"(?:https?://)?(?:www\.)?(mangadex\.(?:org|com)/manga/\d+)"]
    test = [
        ("https://mangadex.org/manga/2946/souten-no-koumori", {
            "url": "9e77934759828458d0424473922e41f348719472",
            # same key name as used by the chapter extractor's tests
            "keyword": {
                "manga": "Souten no Koumori",
                "manga_id": 2946,
                "title": "Oneshot",
                "volume": int,
                "chapter": int,
                "chapter_minor": str,
                "chapter_id": int,
                "group": str,
                "contributor": str,
                "date": str,
                "views": int,
                "lang": str,
                "language": str,
            },
        }),
    ]

    def chapters(self, page):
        """Return a list of (chapter-url, metadata) tuples for a manga.

        One entry is produced for every '<tr id=' ... '</tr>' segment
        found in 'page'. Each metadata dict has the keys 'manga',
        'manga_id', 'title', 'volume', 'chapter', 'chapter_minor',
        'chapter_id', 'group', 'contributor', 'views', 'date', 'lang'
        and 'language'.
        """
        results = []
        extr = text.extract

        # manga-wide values, identical for every chapter; the name is the
        # og:title text up to its last " (" occurrence
        manga = text.unescape(extr(
            page, '"og:title" content="', '"')[0].rpartition(" (")[0])
        manga_id = util.safe_int(extr(
            page, '/images/manga/', '.')[0])

        for info in text.extract_iter(page, "<tr id=", "</tr>"):
            # each lookup continues from where the previous one stopped,
            # so the order of these calls matters
            chid, pos = extr(info, 'data-chapter-id="', '"')
            chapter, pos = extr(info, 'data-chapter-num="', '"', pos)
            volume, pos = extr(info, 'data-volume-num="', '"', pos)
            title, pos = extr(info, 'data-chapter-name="', '"', pos)
            language, pos = extr(info, " title='", "'", pos)
            group, pos = extr(info, "<td>", "</td>", pos)
            user, pos = extr(info, "<td>", "</td>", pos)
            views, pos = extr(info, ">", "<", pos)
            date, pos = extr(info, ' datetime="', '"', pos)

            # "12.5" -> chapter "12", minor ".5"; "12" -> minor ""
            chapter, sep, minor = chapter.partition(".")

            results.append((self.root + "/chapter/" + chid, {
                "manga": manga,
                # already converted to an integer above
                "manga_id": manga_id,
                "title": text.unescape(title),
                "volume": util.safe_int(volume),
                "chapter": util.safe_int(chapter),
                "chapter_minor": sep + minor,
                "chapter_id": util.safe_int(chid),
                "group": text.unescape(text.remove_html(group)),
                "contributor": text.remove_html(user),
                "views": util.safe_int(views),
                "date": date,
                "lang": util.language_to_code(language),
                "language": language,
            }))
        return results

View File

@ -155,6 +155,8 @@ def language_to_code(lang, default=None):
CODES = {
"ar": "Arabic",
"bg": "Bulgarian",
"ca": "Catalan",
"cs": "Czech",
"da": "Danish",
"de": "German",

View File

@ -6,4 +6,4 @@
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
__version__ = "1.3.0"
__version__ = "1.3.1-dev"

View File

@ -18,7 +18,9 @@ SKIP = {
"archivedmoe", "archiveofsins", "thebarchive",
# temporary issues
"imgchili",
"powermanga",
"pinterest",
}