968d3e8465
'/?&#' -> '/?#' and '?&#' -> '?#'. According to https://www.ietf.org/rfc/rfc3986.txt, URLs are "organized hierarchically" by using "the slash ("/"), question mark ("?"), and number sign ("#") characters to delimit components".
74 lines
2.3 KiB
Python
74 lines
2.3 KiB
Python
# -*- coding: utf-8 -*-
|
|
|
|
# Copyright 2019-2020 Mike Fährmann
|
|
#
|
|
# This program is free software; you can redistribute it and/or modify
|
|
# it under the terms of the GNU General Public License version 2 as
|
|
# published by the Free Software Foundation.
|
|
|
|
"""Extractors for https://www.lineblog.me/"""
|
|
|
|
from .livedoor import LivedoorBlogExtractor, LivedoorPostExtractor
|
|
from .. import text
|
|
|
|
|
|
class LineblogBase():
    """Shared settings and image collection for lineblog extractors"""
    category = "lineblog"
    root = "https://lineblog.me"

    def _images(self, post):
        """Build one metadata dict per usable <img> tag of a post

        The "body" entry is removed from 'post' and scanned for '<img '
        tags.  Tags that yield no 'src' value are skipped, but they still
        consume an image number, so "num" reflects the position of the tag
        in the body rather than the position in the returned list.

        Every returned dict references the same 'post' object and is
        passed through text.nameext_from_url(), using the 'alt' text when
        present and the image URL otherwise.
        """
        html = post.pop("body")
        results = []

        for index, tag in enumerate(
                text.extract_iter(html, "<img ", ">"), 1):
            url = text.extract(tag, 'src="', '"')[0]
            label = text.extract(tag, 'alt="', '"')[0]

            if url:
                on_cdn = url.startswith("https://obs.line-scdn.")
                if on_cdn and url.count("/") > 3:
                    # drop the last path segment of obs.line-scdn URLs
                    url, _, _ = url.rpartition("/")

                info = {
                    "url" : url,
                    "num" : index,
                    # whatever follows the final "/" of the (trimmed) URL
                    "hash": url.rpartition("/")[2],
                    "post": post,
                }
                results.append(text.nameext_from_url(label or url, info))

        return results
|
|
|
|
|
|
class LineblogBlogExtractor(LineblogBase, LivedoorBlogExtractor):
    """Extractor for a user's blog on lineblog.me"""
    # Matches "lineblog.me/<name>" with an optional scheme and an optional
    # trailing slash; <name> (group 1) must be followed by the end of the
    # string, "?" or "#", so deeper paths are not matched by this pattern.
    pattern = r"(?:https?://)?lineblog\.me/(\w+)/?(?:$|[?#])"

    # (URL, expected results) pair, limited to the first 20 files
    test = (
        "https://lineblog.me/mamoru_miyano/",
        {
            "range": "1-20",
            "count": 20,
            "pattern": r"https://obs.line-scdn.net/[\w-]+$",
            "keyword": {
                "post": {
                    "categories": tuple,
                    "date": "type:datetime",
                    "description": str,
                    "id": int,
                    "tags": list,
                    "title": str,
                    "user": "mamoru_miyano",
                },
                "filename": str,
                "hash": r"re:\w{32,}",
                "num": int,
            },
        },
    )
|
|
|
|
|
|
class LineblogPostExtractor(LineblogBase, LivedoorPostExtractor):
    """Extractor for blog posts on lineblog.me"""
    # Matches "lineblog.me/<name>/archives/<digits>" with an optional
    # scheme; group 1 is the \w+ path segment, group 2 the digit run.
    # Anything after the digits (e.g. ".html") is left unanchored.
    pattern = r"(?:https?://)?lineblog\.me/(\w+)/archives/(\d+)"

    # (URL, expected results) pair; "url" and "keyword" hold 40-character
    # hex digests -- presumably checksums of the extracted URLs and
    # metadata, to be confirmed against the test runner.
    test = (
        "https://lineblog.me/mamoru_miyano/archives/1919150.html",
        {
            "url": "24afeb4044c554f80c374b52bf8109c6f1c0c757",
            "keyword": "76a38e2c0074926bd3362f66f9fc0e6c41591dcb",
        },
    )
|