diff --git a/gallery_dl/text.py b/gallery_dl/text.py
index 033cf5f3..af906731 100644
--- a/gallery_dl/text.py
+++ b/gallery_dl/text.py
@@ -43,6 +43,17 @@ def remove_html(txt):
return ""
+def split_html(txt, sep=None):  # NOTE(review): 'sep' is accepted but never read in this body
+ """Split input string by html-tags"""
+ try:
+ return [
+ x for x in re.split("<[^>]+>", txt)  # cut at every '<...>' span; the tags themselves are discarded
+ if x and not x.isspace()  # drop empty and whitespace-only pieces; kept pieces are returned unstripped
+ ]
+ except TypeError:  # raised by re.split() when 'txt' is not a string (e.g. None)
+ return []
+
+
def filename_from_url(url):
"""Extract the last part of an url to use as a filename"""
try:
diff --git a/test/test_text.py b/test/test_text.py
index f76fc2a2..697d83b0 100644
--- a/test/test_text.py
+++ b/test/test_text.py
@@ -64,6 +64,30 @@ class TestText(unittest.TestCase):
for value in INVALID:
self.assertEqual(f(value), "")
+ def test_split_html(self, f=text.split_html):
+ result = ["Hello", "World."]
+ empty = []
+
+ # standard usage
+ self.assertEqual(f(""), empty)
+ self.assertEqual(f("Hello World."), ["Hello World."])
+ self.assertEqual(f(" Hello World. "), [" Hello World. "])
+ self.assertEqual(f("Hello
World."), result)
+ self.assertEqual(
+ f("