[slideshare] fix extraction

This commit is contained in:
Mike Fährmann 2021-04-15 17:15:59 +02:00
parent bef3105121
commit de14b7ad7a
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88

View File

@ -6,7 +6,7 @@
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation. # published by the Free Software Foundation.
"""Extract images from https://www.slideshare.net/""" """Extractors for https://www.slideshare.net/"""
from .common import Extractor, Message from .common import Extractor, Message
from .. import text from .. import text
@ -58,15 +58,16 @@ class SlidesharePresentationExtractor(Extractor):
"""Collect metadata for extractor-job""" """Collect metadata for extractor-job"""
descr, pos = text.extract( descr, pos = text.extract(
page, '<meta name="description" content="', '"') page, '<meta name="description" content="', '"')
category, pos = text.extract(
page, '<div class="metadata-item">', '</div>', pos)
views, pos = text.extract(
page, '<div class="metadata-item">', '</div>', pos)
published, pos = text.extract(
page, '<div class="metadata-item">', '</div>', pos)
title, pos = text.extract( title, pos = text.extract(
page, '<span class="j-title-breadcrumb">', '</span>', pos) page, '<span class="j-title-breadcrumb">', '</span>', pos)
views, pos = text.extract(
page, '<span class="notranslate">', 'views<', pos)
published, pos = text.extract(
page, '<time datetime="', '"', pos)
alt_descr, pos = text.extract( alt_descr, pos = text.extract(
page, 'id="slideshow-description-paragraph" class="notranslate">', page, '<p class="slideshow-description notranslate">', '</p>', pos)
'</p>', pos)
if descr.endswith("…") and alt_descr: if descr.endswith("…") and alt_descr:
descr = text.remove_html(alt_descr).strip() descr = text.remove_html(alt_descr).strip()
@ -76,8 +77,9 @@ class SlidesharePresentationExtractor(Extractor):
"presentation": self.presentation, "presentation": self.presentation,
"title": text.unescape(title.strip()), "title": text.unescape(title.strip()),
"description": text.unescape(descr), "description": text.unescape(descr),
"views": text.parse_int(views.replace(",", "")), "views": text.parse_int(views.rpartition(
"published": published, " views")[0].replace(",", "")),
"published": published.strip(),
} }
@staticmethod @staticmethod