gallery-dl/gallery_dl/extractor/hitomi.py

# -*- coding: utf-8 -*-

# Copyright 2015-2019 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extract images from https://hitomi.la/"""

from .common import GalleryExtractor
from .. import text, util
import string
import json


class HitomiGalleryExtractor(GalleryExtractor):
    """Extractor for image galleries from hitomi.la

    The gallery page is only used for metadata; the list of image file
    names is read from a separate 'galleries/<id>.js' resource on
    ltn.hitomi.la (see images()).
    """
    category = "hitomi"
    root = "https://hitomi.la"
    # The only capture group is the numeric gallery ID. An optional
    # "<slug>-" prefix in front of it is matched but not captured.
    pattern = (r"(?:https?://)?hitomi\.la"
               r"/(?:manga|doujinshi|cg|gamecg|galleries|reader)"
               r"/(?:[^/?&#]+-)?(\d+)")
    test = (
        ("https://hitomi.la/galleries/867789.html", {
            "pattern": r"https://aa.hitomi.la/galleries/867789/\d+.jpg",
            "keyword": "6701f8f588f119ef84cd29bdf99a399417b0a6a2",
            "count": 16,
        }),
        ("https://hitomi.la/galleries/1401410.html", {
            # download test
            "range": "1",
            "content": "b3ca8c6c8cc5826cf8b4ceb7252943abad7b8b4c",
        }),
        ("https://hitomi.la/galleries/733697.html", {
            # Game CG with scenes (#321)
            "url": "c2a84185f467450b8b9b72fbe40c0649029ce007",
            "count": 210,
        }),
        ("https://hitomi.la/galleries/1045954.html", {
            # fallback for galleries only available through /reader/ URLs
            "url": "055c898a36389719799d6bce76889cc4ea4421fc",
            "count": 1413,
        }),
        ("https://hitomi.la/manga/amazon-no-hiyaku-867789.html"),
        ("https://hitomi.la/manga/867789.html"),
        ("https://hitomi.la/doujinshi/867789.html"),
        ("https://hitomi.la/cg/867789.html"),
        ("https://hitomi.la/gamecg/867789.html"),
        ("https://hitomi.la/reader/867789.html"),
    )

    # Maximum number of HTML "Redirect" pages request() follows in a row.
    # Guards against redirect loops, which would otherwise never terminate.
    _redirect_limit = 10

    def __init__(self, match):
        """Initialize the extractor from a regex match of 'pattern'.

        Whatever URL variant was matched, the gallery is always requested
        through its '/galleries/<id>.html' URL.
        """
        self.gallery_id = match.group(1)
        # set to True by request() once a 404 forced the /reader/ fallback
        self.fallback = False
        url = "{}/galleries/{}.html".format(self.root, self.gallery_id)
        GalleryExtractor.__init__(self, match, url)

    def request(self, url, **kwargs):
        """Send a request for 'url' with hitomi-specific error handling.

        - On a 404 response, 'self.fallback' is set and the request is
          repeated once with "/galleries/" replaced by "/reader/" in the
          URL; the result of that second request is returned.
        - A response whose body contains '<title>Redirect</title>' is
          followed to the first "href='...'" target found in it (relative
          targets are resolved against 'self.root'), at most
          '_redirect_limit' times in a row.

        All keyword arguments are forwarded to GalleryExtractor.request().
        Returns the final response object.
        """
        # The probing request must never fail on a 404 by itself. Build a
        # separate kwargs dict so a caller-supplied 'fatal' is overridden
        # instead of being passed twice (which would raise a TypeError).
        probe_kwargs = dict(kwargs, fatal=False)

        response = None
        for _ in range(self._redirect_limit + 1):
            response = GalleryExtractor.request(self, url, **probe_kwargs)

            if response.status_code == 404:
                self.fallback = True
                url = url.replace("/galleries/", "/reader/")
                # caller's own kwargs: error handling is back to whatever
                # the caller (or the base class default) asked for
                return GalleryExtractor.request(self, url, **kwargs)

            if b"<title>Redirect</title>" not in response.content:
                return response

            target = text.extract(response.text, "href='", "'")[0]
            if not target:
                # redirect page without a usable target: nothing to follow
                return response
            if not target.startswith("http"):
                target = text.urljoin(self.root, target)
            url = target

        # redirect limit exhausted; hand back the last page received
        return response

    def metadata(self, page):
        """Return a dict of gallery metadata extracted from 'page'.

        The full set of fields is only available from the regular gallery
        page. If the /reader/ fallback was used, or the expected gallery
        header is not present in 'page', only "gallery_id" and "title"
        are returned.
        """
        if not self.fallback:
            start = page.find('<h1><a href="/reader/')
            if start >= 0:
                return self._metadata_gallery(page, start)
        return self._metadata_minimal(page)

    def _metadata_minimal(self, page):
        """Return "gallery_id" and "title" using the page's <title> tag"""
        # guard against a missing <title> (extraction result may be empty)
        title = text.extract(page, "<title>", "<")[0] or ""
        # drop the trailing " | <suffix>" part, but keep the whole title
        # if there is no such separator
        head, sep, _ = title.rpartition(" | ")
        if sep:
            title = head
        return {
            "gallery_id": text.parse_int(self.gallery_id),
            "title": text.unescape(title),
        }

    def _metadata_gallery(self, page, start):
        """Return the full metadata dict, reading 'page' from 'start' on"""
        # NOTE: the extr() calls below are evaluated in order and each one
        # presumably continues where the previous one stopped, so the
        # order of the entries must follow the order of the page markup
        extr = text.extract_from(page, start)
        data = {
            "gallery_id": text.parse_int(self.gallery_id),
            "title"     : text.unescape(extr('.html">', '<').strip()),
            "artist"    : self._prep(extr('<h2>', '</h2>')),
            "group"     : self._prep(extr('<td>Group</td><td>', '</td>')),
            "type"      : self._prep_1(extr('<td>Type</td><td>', '</td>')),
            "language"  : self._prep_1(extr('<td>Language</td><td>', '</td>')),
            "parody"    : self._prep(extr('<td>Series</td><td>', '</td>')),
            "characters": self._prep(extr('<td>Characters</td><td>', '</td>')),
            "tags"      : self._prep(extr('<td>Tags</td><td>', '</td>')),
            "date"      : self._date(extr('<span class="date">', '</span>')),
        }
        # "N/a" is what _prep_1() produces from an "N/A" table cell
        if data["language"] == "N/a":
            data["language"] = None
        data["lang"] = util.language_to_code(data["language"])
        return data

    def images(self, page):
        """Return a list of (image URL, None) tuples for this gallery.

        'page' is not used; the image list is loaded from the gallery's
        '.js' resource on ltn.hitomi.la.
        """
        # see https://ltn.hitomi.la/common.js
        # last digit of the gallery ID modulo 3 selects "aa", "ba" or "ca"
        offset = text.parse_int(self.gallery_id[-1]) % 3
        subdomain = chr(97 + offset) + "a"
        base = "https://{}.hitomi.la/galleries/{}/".format(
            subdomain, self.gallery_id)

        # set Referer header before image downloads (#239)
        self.session.headers["Referer"] = self.gallery_url

        # get 'galleryinfo'
        # NOTE(review): this goes through self.request(), so a 404 here
        # also triggers the "/reader/" retry -- confirm that is intended
        url = "https://ltn.hitomi.la/galleries/{}.js".format(self.gallery_id)
        script = self.request(url).text

        # the JSON payload is everything after the first "=" of the
        # script ("<name> = [...]")
        return [
            (base + image["name"], None)
            for image in json.loads(script.partition("=")[2])
        ]

    @staticmethod
    def _prep(value):
        """Return a list of capitalized, unescaped names from 'value'.

        One entry is produced for every piece of text that
        text.extract_iter() yields between '.html">' and '<';
        a false 'value' (None, "") results in an empty list.
        """
        return [
            text.unescape(string.capwords(v))
            for v in text.extract_iter(value or "", '.html">', '<')
        ]

    @staticmethod
    def _prep_1(value):
        """Return 'value' with HTML removed and only its first letter
        in upper case"""
        return text.remove_html(value).capitalize()

    @staticmethod
    def _date(value):
        """Parse the gallery's date string with text.parse_datetime()"""
        # ":00" is appended before parsing with a trailing "%z";
        # presumably the site prints its UTC offset as hours only
        # (e.g. "-05") -- TODO confirm
        return text.parse_datetime(value + ":00", "%Y-%m-%d %H:%M:%S%z")
add extractor 'hitomi' 2015-10-28 16:24:35 +01:00			`# -- coding: utf-8 --`

simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 2019-02-08 13:45:40 +01:00			`# Copyright 2015-2019 Mike Fährmann`
add extractor 'hitomi' 2015-10-28 16:24:35 +01:00			`#`
			`# This program is free software; you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License version 2 as`
			`# published by the Free Software Foundation.`

			`"""Extract images from https://hitomi.la/"""`

use GalleryExtractor as common base class 2019-02-26 14:08:02 +01:00			`from .common import GalleryExtractor`
move code into util.py 2017-03-28 13:12:44 +02:00			`from .. import text, util`
[hitomi] more metadata 2015-10-29 17:53:29 +01:00			`import string`
[hitomi] fix extraction 2019-10-29 16:23:20 +01:00			`import json`
add extractor 'hitomi' 2015-10-28 16:24:35 +01:00
code adjustments according to pep8 nr2 2017-02-01 00:53:19 +01:00
use GalleryExtractor as common base class 2019-02-26 14:08:02 +01:00			`class HitomiGalleryExtractor(GalleryExtractor):`
consistent extractor naming scheme + docstrings 2016-09-12 10:20:57 +02:00			`"""Extractor for image galleries from hitomi.la"""`
update all other extractors 2015-11-21 04:26:30 +01:00			`category = "hitomi"`
[hitomi] handle Game CG galleries with scenes (fixes #321) 2019-06-27 20:25:40 +02:00			`root = "https://hitomi.la"`
[hitomi] extend URL pattern + follow redirects 2019-11-01 21:40:10 +01:00			`pattern = (r"(?:https?://)?hitomi\.la"`
			`r"/(?:manga\|doujinshi\|cg\|gamecg\|galleries\|reader)"`
			`r"/(?:[^/?&#]+-)?(\d+)")`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 2019-02-08 13:45:40 +01:00			`test = (`
[hitomi] fix image URLs and improve metadata - use '?a.hitomi.la' as subdomain depending in gallery-id - add 'characters', 'tags' and 'date' information - support multiple entires per metadata-value - rename 'num' to 'page' 2018-03-20 17:36:06 +01:00			`("https://hitomi.la/galleries/867789.html", {`
[hitomi] fix image URLs 2019-10-09 17:21:37 +02:00			`"pattern": r"https://aa.hitomi.la/galleries/867789/\d+.jpg",`
update extractor class hierarchies - let the GalleryExtractor class inherit directly from Extractor - make ChapterExtractor a subclass of GalleryExtractor - change enumeration field names of GalleryExtractors to 'num' 2019-10-16 18:12:07 +02:00			`"keyword": "6701f8f588f119ef84cd29bdf99a399417b0a6a2",`
[hitomi] fix image URLs 2019-10-09 17:21:37 +02:00			`"count": 16,`
[hitomi] fix image subdomains (closes #142) galleries with an ID ending in 1 need some special treatment 2018-12-14 16:15:06 +01:00			`}),`
[hitomi] set Referer header (fixes #239) 2019-05-01 10:54:42 +02:00			`("https://hitomi.la/galleries/1401410.html", {`
			`# download test`
			`"range": "1",`
			`"content": "b3ca8c6c8cc5826cf8b4ceb7252943abad7b8b4c",`
			`}),`
[hitomi] handle Game CG galleries with scenes (fixes #321) 2019-06-27 20:25:40 +02:00			`("https://hitomi.la/galleries/733697.html", {`
			`# Game CG with scenes (#321)`
			`"url": "c2a84185f467450b8b9b72fbe40c0649029ce007",`
			`"count": 210,`
			`}),`
[hitomi] fallback to /reader/ page if main page returns 404 Some galleries return a 404: Not Found error when trying to access them through the main gallery URL, but their content is still available on the respective /reader/ page. 2019-10-11 18:25:54 +02:00			`("https://hitomi.la/galleries/1045954.html", {`
			`# fallback for galleries only available through /reader/ URLs`
			`"url": "055c898a36389719799d6bce76889cc4ea4421fc",`
			`"count": 1413,`
			`}),`
[hitomi] extend URL pattern + follow redirects 2019-11-01 21:40:10 +01:00			`("https://hitomi.la/manga/amazon-no-hiyaku-867789.html"),`
			`("https://hitomi.la/manga/867789.html"),`
			`("https://hitomi.la/doujinshi/867789.html"),`
			`("https://hitomi.la/cg/867789.html"),`
			`("https://hitomi.la/gamecg/867789.html"),`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 2019-02-08 13:45:40 +01:00			`("https://hitomi.la/reader/867789.html"),`
			`)`
update all other extractors 2015-11-21 04:26:30 +01:00
add extractor 'hitomi' 2015-10-28 16:24:35 +01:00			`def __init__(self, match):`
[hitomi] fix image URLs 2019-10-09 17:21:37 +02:00			`self.gallery_id = match.group(1)`
[hitomi] fallback to /reader/ page if main page returns 404 Some galleries return a 404: Not Found error when trying to access them through the main gallery URL, but their content is still available on the respective /reader/ page. 2019-10-11 18:25:54 +02:00			`self.fallback = False`
[hitomi] handle Game CG galleries with scenes (fixes #321) 2019-06-27 20:25:40 +02:00			`url = "{}/galleries/{}.html".format(self.root, self.gallery_id)`
use GalleryExtractor as common base class 2019-02-26 14:08:02 +01:00			`GalleryExtractor.__init__(self, match, url)`
add extractor 'hitomi' 2015-10-28 16:24:35 +01:00
[hitomi] fallback to /reader/ page if main page returns 404 Some galleries return a 404: Not Found error when trying to access them through the main gallery URL, but their content is still available on the respective /reader/ page. 2019-10-11 18:25:54 +02:00			`def request(self, url, **kwargs):`
			`response = GalleryExtractor.request(self, url, fatal=False, **kwargs)`
			`if response.status_code == 404:`
			`self.fallback = True`
			`url = url.replace("/galleries/", "/reader/")`
			`response = GalleryExtractor.request(self, url, **kwargs)`
[hitomi] extend URL pattern + follow redirects 2019-11-01 21:40:10 +01:00			`elif b"<title>Redirect</title>" in response.content:`
			`url = text.extract(response.text, "href='", "'")[0]`
			`if not url.startswith("http"):`
			`url = text.urljoin(self.root, url)`
			`response = self.request(url, **kwargs)`
[hitomi] fallback to /reader/ page if main page returns 404 Some galleries return a 404: Not Found error when trying to access them through the main gallery URL, but their content is still available on the respective /reader/ page. 2019-10-11 18:25:54 +02:00			`return response`

change Chapter and MangaExtractor classes - unify and simplify constructors - rename get_metadata and get_images to just metadata() and images() - rename self.url to chapter_url and manga_url 2019-02-11 18:38:47 +01:00			`def metadata(self, page):`
[hitomi] fallback to /reader/ page if main page returns 404 Some galleries return a 404: Not Found error when trying to access them through the main gallery URL, but their content is still available on the respective /reader/ page. 2019-10-11 18:25:54 +02:00			`if self.fallback:`
			`return {`
			`"gallery_id": text.parse_int(self.gallery_id),`
			`"title": text.unescape(text.extract(`
			`page, "<title>", "<")[0].rpartition(" \| ")[0]),`
			`}`

[hitomi] simplify data extraction code 2019-05-01 11:14:21 +02:00			`extr = text.extract_from(page, page.index('<h1><a href="/reader/'))`
			`data = {`
[hitomi] fix image URLs 2019-10-09 17:21:37 +02:00			`"gallery_id": text.parse_int(self.gallery_id),`
[hitomi] simplify data extraction code 2019-05-01 11:14:21 +02:00			`"title" : text.unescape(extr('.html">', '<').strip()),`
			`"artist" : self._prep(extr('<h2>', '</h2>')),`
			`"group" : self._prep(extr('<td>Group</td><td>', '</td>')),`
			`"type" : self._prep_1(extr('<td>Type</td><td>', '</td>')),`
			`"language" : self._prep_1(extr('<td>Language</td><td>', '</td>')),`
			`"parody" : self._prep(extr('<td>Series</td><td>', '</td>')),`
			`"characters": self._prep(extr('<td>Characters</td><td>', '</td>')),`
			`"tags" : self._prep(extr('<td>Tags</td><td>', '</td>')),`
[hitomi] fix empty language detection; parse datetime 2019-06-17 19:59:43 +02:00			`"date" : self._date(extr('<span class="date">', '</span>')),`
add extractor 'hitomi' 2015-10-28 16:24:35 +01:00			`}`
[hitomi] fix empty language detection; parse datetime 2019-06-17 19:59:43 +02:00			`if data["language"] == "N/a":`
[hitomi] simplify data extraction code 2019-05-01 11:14:21 +02:00			`data["language"] = None`
			`data["lang"] = util.language_to_code(data["language"])`
			`return data`
add extractor 'hitomi' 2015-10-28 16:24:35 +01:00
change Chapter and MangaExtractor classes - unify and simplify constructors - rename get_metadata and get_images to just metadata() and images() - rename self.url to chapter_url and manga_url 2019-02-11 18:38:47 +01:00			`def images(self, page):`
[hitomi] fix image subdomains (closes #142) galleries with an ID ending in 1 need some special treatment 2018-12-14 16:15:06 +01:00			`# see https://ltn.hitomi.la/common.js`
[hitomi] fix image URLs 2019-10-09 17:21:37 +02:00			`offset = text.parse_int(self.gallery_id[-1]) % 3`
[hitomi] fix image subdomains (closes #142) galleries with an ID ending in 1 need some special treatment 2018-12-14 16:15:06 +01:00			`subdomain = chr(97 + offset) + "a"`
[hitomi] fix extraction 2019-10-29 16:23:20 +01:00			`base = "https://{}.hitomi.la/galleries/{}/".format(`
			`subdomain, self.gallery_id)`
[hitomi] fix image subdomains (closes #142) galleries with an ID ending in 1 need some special treatment 2018-12-14 16:15:06 +01:00
[hitomi] set Referer header (fixes #239) 2019-05-01 10:54:42 +02:00			`# set Referer header before image downloads (#239)`
update extractor class hierarchies - let the GalleryExtractor class inherit directly from Extractor - make ChapterExtractor a subclass of GalleryExtractor - change enumeration field names of GalleryExtractors to 'num' 2019-10-16 18:12:07 +02:00			`self.session.headers["Referer"] = self.gallery_url`
[hitomi] set Referer header (fixes #239) 2019-05-01 10:54:42 +02:00
[hitomi] fix extraction 2019-10-29 16:23:20 +01:00			`# get 'galleryinfo'`
			`url = "https://ltn.hitomi.la/galleries/{}.js".format(self.gallery_id)`
			`page = self.request(url).text`
[hitomi] handle Game CG galleries with scenes (fixes #321) 2019-06-27 20:25:40 +02:00
[hitomi] fix some keywords 2016-09-23 08:21:49 +02:00			`return [`
[hitomi] fix extraction 2019-10-29 16:23:20 +01:00			`(base + image["name"], None)`
			`for image in json.loads(page.partition("=")[2])`
[hitomi] fix some keywords 2016-09-23 08:21:49 +02:00			`]`
[hitomi] fix image URLs and improve metadata - use '?a.hitomi.la' as subdomain depending in gallery-id - add 'characters', 'tags' and 'date' information - support multiple entires per metadata-value - rename 'num' to 'page' 2018-03-20 17:36:06 +01:00
			`@staticmethod`
[hitomi] simplify data extraction code 2019-05-01 11:14:21 +02:00			`def _prep(value):`
adjust metadata types for GalleryExtractors 2019-03-01 23:13:40 +01:00			`return [`
			`text.unescape(string.capwords(v))`
			`for v in text.extract_iter(value or "", '.html">', '<')`
			`]`
[hitomi] simplify data extraction code 2019-05-01 11:14:21 +02:00
			`@staticmethod`
			`def _prep_1(value):`
			`return text.remove_html(value).capitalize()`
[hitomi] fix empty language detection; parse datetime 2019-06-17 19:59:43 +02:00
			`@staticmethod`
			`def _date(value):`
			`return text.parse_datetime(value + ":00", "%Y-%m-%d %H:%M:%S%z")`